remove CompilerPair (#14638)

2026-06-24 02:14:17 +00:00 · 2026-02-09 16:51:18 -08:00 · 2026-02-09 16:51:18 -08:00 · e6562a5061
commit e6562a5061
parent 396e1320fb
21 changed files with 112 additions and 147 deletions
--- a/test/device/test_metal.py
+++ b/test/device/test_metal.py
@ -1,5 +1,5 @@
 import unittest
-from tinygrad.device import CompileError, Device, Compiler
+from tinygrad.device import CompileError, Device
 if Device.DEFAULT=="METAL":
  from tinygrad.runtime.ops_metal import MetalDevice, MetalCompiler, MetalProgram
@unittest.skipIf(Device.DEFAULT!="METAL", "Metal support required")
@ -48,28 +48,4 @@ kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgr
 """)
    with self.assertRaises(RuntimeError):
      compiled = compiled[:40] # corrupt the compiled program
-      MetalProgram(device, "r_5", compiled)
-
-  def test_program_w_empty_compiler(self):
-    device = MetalDevice("metal")
-    compiler = Compiler(device)
-    compiled = compiler.compile("""
-#include <metal_stdlib>
-kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]){
-  data0[0] = 0;
-}
-""")
-    MetalProgram(device, "r_5", compiled)
-
-  def test_bad_program_w_empty_compiler(self):
-    device = MetalDevice("metal")
-    compiler = Compiler(device)
-    # this does not raise
-    compiled = compiler.compile("""
-#include <metal_stdlib>
-kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]){
-  invalid codes;
-}
-""")
-    with self.assertRaises(RuntimeError):
      MetalProgram(device, "r_5", compiled)
--- a/test/device/test_ocl.py
+++ b/test/device/test_ocl.py
@ -1,9 +1,19 @@
 import unittest
+from unittest.mock import patch
 from tinygrad import Device
 from tinygrad.device import Buffer
 from tinygrad.dtype import dtypes
 from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram

+@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
+class TestCLCompileCache(unittest.TestCase):
+  def test_compile_cached(self):
+    device = Device[Device.DEFAULT]
+    src = "__kernel void cached_test(__global int* a) { a[0] = 1; }"
+    CLProgram(device, name="cached_test", lib=src.encode())
+    with patch.object(CLCompiler, 'compile', side_effect=RuntimeError("compile should not be called on cache hit")):
+      CLProgram(device, name="cached_test", lib=src.encode())
+
@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
 class TestCLError(unittest.TestCase):
  @unittest.skip("allocates tons of memory")
@ -17,7 +27,7 @@ class TestCLError(unittest.TestCase):
  def test_invalid_kernel_name(self):
    device = Device[Device.DEFAULT]
    with self.assertRaises(RuntimeError) as err:
-      CLProgram(device, name="", lib=CLCompiler(device, "test").compile("__kernel void test(__global int* a) { a[0] = 1; }"))
+      CLProgram(device, name="", lib="__kernel void test(__global int* a) { a[0] = 1; }".encode())
    assert str(err.exception) == "OpenCL Error -46: CL_INVALID_KERNEL_NAME"

  def test_unaligned_copy(self):
--- a/tinygrad/codegen/init.py
+++ b/tinygrad/codegen/init.py
@ -139,7 +139,6 @@ def do_render(ctx:Renderer, prg:UOp, lin:UOp) -> UOp:
  return prg.replace(src=prg.src + (UOp(Ops.SOURCE, arg=src),), arg=ctx.aux(list(lin.src)) if ctx.has_aux else prg.arg)

 def do_compile(ctx:Renderer, prg:UOp, source:UOp) -> UOp|None:
-  if ctx.compiler is None: return None
  lib = ctx.compiler.compile_cached(source.arg)
  return prg.replace(src=prg.src + (UOp(Ops.BINARY, arg=lib),))

--- a/tinygrad/device.py
+++ b/tinygrad/device.py
@ -1,14 +1,14 @@
 from __future__ import annotations
 from dataclasses import dataclass, replace
 from collections import defaultdict
-from typing import Any, Generic, TypeVar, Iterator, Generator
+from typing import Any, Generic, TypeVar, Iterator, Generator, TYPE_CHECKING
 import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
 from tinygrad.helpers import CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
 from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, dedup, ContextVar
 from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
 from tinygrad.helpers import EMULATED_DTYPES
 from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes, _to_np_dtype
-from tinygrad.renderer import Renderer
+if TYPE_CHECKING: from tinygrad.renderer import Renderer

 # **************** Device ****************

@ -278,39 +278,34 @@ class Compiler:
  def disassemble(self, lib:bytes): pass

@dataclass(frozen=True)
-class CompilerPair:
-  renderer:type[Renderer]|functools.partial; compiler:type[Compiler]|functools.partial|None = None; ctrl_var:ContextVar|None = None # noqa: E702
-  name:str|None = None
-
-@dataclass(frozen=True)
-class CompilerSet: cset:list[CompilerPair]; ctrl_var:ContextVar|None = None # noqa: E702
+class CompilerSet: cset:list[tuple[type[Renderer]|functools.partial, ContextVar|None]]; ctrl_var:ContextVar|None = None # noqa: E702

 class Compiled:
  profile_events:list[ProfileEvent] = [ProfileDeviceEvent("CPU")] # NOTE: CPU is the default device.

  def __init__(self, device:str, allocator:Allocator, compilers:CompilerSet|None, runtime, graph=None, group_id=None):
+    from tinygrad.renderer import Renderer
+
    self.device, self.allocator, self.runtime, self.graph, self.group_id = device, allocator, runtime, graph, group_id

    self.comps_ctrl_var = compilers.ctrl_var if compilers is not None else None
-    self.comp_sets:dict[Any, tuple[ContextVar|None, tuple[type[Renderer]|functools.partial, type[Compiler]|functools.partial|None]]] = {}
-    self.cached_pair:dict[Any, tuple[Renderer, Compiler|None]] = {}
-    for cpair in (compilers.cset if compilers is not None else [CompilerPair(Renderer, Compiler)]):
-      self.comp_sets[cpair.name or self._compiler_name(cpair.renderer, cpair.compiler)] = (cpair.ctrl_var, (cpair.renderer, cpair.compiler))
+    self.comp_sets:dict[str, tuple[ContextVar|None, type[Renderer]|functools.partial]] = {}
+    self.cached_pair:dict[Any, Renderer] = {}
+    for ren, var in (compilers.cset if compilers is not None else [(Renderer, None)]):
+      self.comp_sets[var.key.split('_', 1)[-1] if var is not None else self._compiler_name(ren)] = (var, ren)

  @property
-  def renderer(self) -> Renderer: return self._select_compiler_pair()[0]
+  def renderer(self) -> Renderer: return self._select_compiler_pair()

  @property
  def compiler(self) -> Compiler:
-    if (ret:=self.renderer.compiler or self._select_compiler_pair()[1]) is None: raise RuntimeError(f"no compiler for {self.device}")
+    if (ret:=self.renderer.compiler) is None: raise RuntimeError(f"no compiler for {self.device}")
    return ret

-  def _compiler_name(self, r:type[Renderer]|functools.partial, c:type[Compiler]|functools.partial|None) -> str:
-    devname = self.device.split(':')[0].upper()
-    if c is None: return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname) or devname
-    return unwrap_class_type(c).__name__.upper().removesuffix("COMPILER").removeprefix(devname) or devname
+  def _compiler_name(self, r:type[Renderer]|functools.partial) -> str:
+    return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname:=self.device.split(':')[0].upper()) or devname

-  def _select_compiler_pair(self) -> tuple[Renderer, Compiler|None]:
+  def _select_compiler_pair(self) -> Renderer:
    # select forced compiler from global env var.
    forced_comps = set([self.comp_sets[val][1]] if self.comps_ctrl_var is not None and (val:=self.comps_ctrl_var.value) else [])

@ -399,18 +394,18 @@ def enumerate_devices_str() -> Generator[str, None, None]:
      d = Device[device]
      default_comp_pairs, default_compiler, cc_ctrl_var = d.comp_sets, d.compiler, d.comps_ctrl_var
      try:
-        for k,(en,(r,c)) in default_comp_pairs.items():
-          d.comp_sets = {k:(None,(r,c))} # env var set to None, so it doesn't interfere
+        for k,(en,r) in default_comp_pairs.items():
+          d.comp_sets = {k:(None,r)} # env var set to None, so it doesn't interfere
          d.comps_ctrl_var = None
          try:
            # d.renderer, d.compiler = r(), c()
            with Context(CACHELEVEL=0): test = (Tensor([1,2,3], device=device) * 2).tolist()
            if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
-            set_text = f'({cc_ctrl_var.key}={d._compiler_name(r, c)} to make default)' if cc_ctrl_var is not None else ''
+            set_text = f'({cc_ctrl_var.key}={d._compiler_name(r)} to make default)' if cc_ctrl_var is not None else ''
            default_text = '(default)' if type(default_compiler) is type(d.compiler) else set_text
-            compilers_results.append(f"{colored('+', 'green')} {d._compiler_name(r, c)} {default_text}")
+            compilers_results.append(f"{colored('+', 'green')} {d._compiler_name(r)} {default_text}")
            any_works = True
-          except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._compiler_name(r, c)}: {e}")
+          except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._compiler_name(r)}: {e}")
      finally:
        # put the defaults back!
        d.comp_sets, d.comps_ctrl_var = default_comp_pairs, cc_ctrl_var
--- a/tinygrad/helpers.py
+++ b/tinygrad/helpers.py
@ -185,11 +185,12 @@ ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE = ContextVar("ALLOW_DEVICE_USAGE", 1), Conte
 EMULATE, EMULATED_DTYPES = ContextVar("EMULATE", ""), ContextVar("EMULATED_DTYPES", "")
 CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
 # Compilers
-CPU_LLVM, CPU_LVP, AMD_LLVM = ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0), ContextVar("AMD_LLVM", 0)
-NV_PTX, CUDA_PTX, NV_NAK, QCOM_IR3 = ContextVar("NV_PTX", 0), ContextVar("CUDA_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("QCOM_IR3", 0)
+CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
+NV_CC, NV_PTX, NV_NAK = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0)
+CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
 NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0)
-AMD_CC, CPU_CC, NV_CC, CUDA_CC = ContextVar("AMD_CC", ""), ContextVar("CPU_CC", ""), ContextVar("NV_CC", ""), ContextVar("CUDA_CC", "")
-QCOM_CC = ContextVar("QCOM_CC", "")
+AMD_CC, AMD_LLVM, AMD_HIPCC  = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
+QCOM_CC, QCOM_IR3 = ContextVar("QCOM_CC", ""), ContextVar("QCOM_IR3", 0)
 # VIZ implies PROFILE, but you can run PROFILE without VIZ
 VIZ = ContextVar("VIZ", 0)
 PROFILE = ContextVar("PROFILE", abs(VIZ.value))
--- a/tinygrad/renderer/init.py
+++ b/tinygrad/renderer/init.py
@ -1,5 +1,5 @@
 from __future__ import annotations
-from typing import Callable, cast, TYPE_CHECKING
+from typing import Callable, cast
 import functools
 from dataclasses import dataclass, field
 from tinygrad.helpers import to_function_name, dedup, prod, DEBUG
@ -7,7 +7,7 @@ from tinygrad.uop.ops import Ops, UOp, sym_infer, sint, Variable, ssimplify, Gro
 from tinygrad.dtype import AddrSpace, PtrDType
 from tinygrad.codegen.opt.tc import TensorCore
 from tinygrad.codegen.opt import Opt
-if TYPE_CHECKING: from tinygrad.device import Compiler
+from tinygrad.device import Compiler

@dataclass(frozen=True)
 class Estimates:
@ -150,7 +150,8 @@ class Renderer:
  pre_matcher: PatternMatcher|None = None
  extra_matcher: PatternMatcher|None = None
  code_for_op: dict[Ops, Callable] = {}
-  compiler: Compiler|None = None
+
+  compiler: Compiler = Compiler()

  def __reduce__(self): return self.__class__, ()
  def render(self, uops:list[UOp]) -> str: raise NotImplementedError("needs a renderer")
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@ -3,10 +3,9 @@ import os, math, sys, struct
 from collections import defaultdict, Counter
 from tinygrad.codegen.opt import tc
 from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str, axis_letters
-from tinygrad.helpers import strip_parens, getenv, prod, dedup, select_first_inited, AMX, CPU_COUNT
+from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX, CPU_COUNT
 from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType, AddrSpace, truncate, float_to_bf16
 from tinygrad.renderer import Renderer
-from tinygrad.device import Compiler
 from tinygrad.codegen.late.devectorizer import no_vectorized_alu


@ -343,8 +342,7 @@ class MetalRenderer(CStyleLanguage):
  shared_max = 32768
  def __init__(self):
    from tinygrad.runtime.ops_metal import MetalCompiler
-    self.compiler = select_first_inited([MetalCompiler, Compiler], "No compiler for METAL is available")
-    self.tensor_cores = tc.metal if hasattr(os, 'uname') and os.uname().machine == "arm64" else []
+    self.compiler, self.tensor_cores = MetalCompiler(), tc.metal if hasattr(os, 'uname') and os.uname().machine == "arm64" else []

  # language options
  kernel_typedef = "kernel void"
--- a/tinygrad/runtime/autogen/init.py
+++ b/tinygrad/runtime/autogen/init.py
@ -115,7 +115,7 @@ def __getattr__(nm):
      return load("rocprof", "['rocprof-trace-decoder', p:='/usr/local/lib/rocprof-trace-decoder.so', p.replace('so','dylib')]",
                  [f"{{}}/include/{s}.h" for s in ["rocprof_trace_decoder", "trace_decoder_instrument", "trace_decoder_types"]],
                  tarball="https://github.com/ROCm/rocprof-trace-decoder/archive/dd0485100971522cc4cd8ae136bdda431061a04d.tar.gz")
-    case "mesa": return load("mesa", "['tinymesa_cpu', 'tinymesa']", [
+    case "mesa": return load("mesa", "([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu']", [
        *[f"{{}}/src/compiler/nir/{s}.h" for s in ["nir", "nir_builder", "nir_shader_compiler_options", "nir_serialize"]], "{}/gen/nir_intrinsics.h",
        *[f"{{}}/src/nouveau/{s}.h" for s in ["headers/nv_device_info", "compiler/nak"]],
        *[f"{{}}/src/gallium/auxiliary/gallivm/lp_bld{s}.h" for s in ["", "_passmgr", "_misc", "_type", "_init", "_nir", "_struct", "_jit_types",
@ -134,7 +134,8 @@ def __getattr__(nm):
          *[f"python3 src/compiler/{s}_h.py > gen/{s.split('/')[-1]}.h" for s in ["nir/nir_opcodes", "nir/nir_builder_opcodes"]],
          *[f"python3 src/compiler/nir/nir_{s}_h.py --outdir gen" for s in ["intrinsics", "intrinsics_indices"]]]), cwd=path, shell=True, check=True),
  tarball="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.7/mesa-25.2.7.tar.gz",
-  prolog=["import gzip, base64"], epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
+  prolog=["from tinygrad.helpers import CPU_CC, CPU_LVP", "import gzip, base64"],
+  epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
    case "libclang":
      return load("libclang", clang_lib,
                  lambda: [f"{system('llvm-config-20 --includedir')}/clang-c/{s}.h" for s in ["Index", "CXString", "CXSourceLocation", "CXFile"]],
--- a/tinygrad/runtime/autogen/mesa.py
+++ b/tinygrad/runtime/autogen/mesa.py
@ -4,8 +4,9 @@ import ctypes
 from typing import Annotated, Literal, TypeAlias
 from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR
 from tinygrad.runtime.support import c
+from tinygrad.helpers import CPU_CC, CPU_LVP
 import gzip, base64
-dll = c.DLL('mesa', ['tinymesa_cpu', 'tinymesa'])
+dll = c.DLL('mesa', ([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu'])
 class struct_u_printf_info(ctypes.Structure): pass
 u_printf_info: TypeAlias = struct_u_printf_info
 uint32_t: TypeAlias = Annotated[int, ctypes.c_uint32]
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@ -6,9 +6,9 @@ from dataclasses import dataclass
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
 from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator, hcq_filter_visible_devices
 from tinygrad.uop.ops import sint
-from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet, CompilerPair
+from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet
 from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, lo32, hi32, colored, prod, ContextVar
-from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv, unwrap
+from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, AMD_HIPCC, ceildiv, unwrap
 from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer
 from tinygrad.renderer.llvmir import AMDLLVMRenderer
 from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt, amdgpu_kd, amdgpu_drm
@ -962,9 +962,9 @@ class AMDDevice(HCQCompiled):
    self.sdma_queues:dict = {}
    self.has_sdma_queue = self.sdma_queue(0) is not None

-    compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None),
-                             CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), None, AMD_LLVM),
-                             CompilerPair(functools.partial(AMDHIPCCRenderer, self.arch), None)], ctrl_var=AMD_CC)
+    compilers = CompilerSet([(functools.partial(AMDHIPRenderer, self.arch), None),
+                             (functools.partial(AMDLLVMRenderer, self.arch), AMD_LLVM),
+                             (functools.partial(AMDHIPCCRenderer, self.arch), AMD_HIPCC)], ctrl_var=AMD_CC)

    super().__init__(device, AMDAllocator(self), compilers, functools.partial(AMDProgram, self), AMDSignal,
                     functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),
--- a/tinygrad/runtime/ops_cl.py
+++ b/tinygrad/runtime/ops_cl.py
@ -5,7 +5,7 @@ from tinygrad.runtime.autogen import opencl as cl
 from tinygrad.runtime.support import c
 from tinygrad.helpers import to_char_p_p, from_mv, OSX, DEBUG, mv_address, suppress_finalizing
 from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
-from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError, CompilerPair, CompilerSet
+from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError, CompilerSet
 from tinygrad.dtype import ImageDType

 CC_CB = c.CFUNCTYPE[None, [c.POINTER[ctypes.c_char], c.POINTER[None], cl.size_t, c.POINTER[None]]]
@ -39,9 +39,9 @@ class CLCompiler(Compiler):

 class CLProgram:
  def __init__(self, device:CLDevice, name:str, lib:bytes, buf_dtypes=[], **kwargs):
-    self.dev, self.name, self.lib, self.buf_dtypes = device, name, lib, buf_dtypes
-    self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(lib)),
-                                                        to_char_p_p([lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
+    self.dev, self.name, self.lib, self.buf_dtypes = device, name, device.clc.compile_cached(lib.decode()), buf_dtypes
+    self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(self.lib)),
+                                                        to_char_p_p([self.lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
                                                        errcode_ret := ctypes.c_int32()), errcode_ret)
    check(binary_status.value)
    check(cl.clBuildProgram(self.program, 1, device.device_id, None, BP_CB(), None)) # NOTE: OSX requires this
@ -125,8 +125,8 @@ class CLDevice(Compiled):
                                           ctypes.string_at(buf, size=total.value).decode())[1]

    renderer = IntelRenderer if "cl_intel_subgroup_matrix_multiply_accumulate" in self.device_exts else OpenCLRenderer
-    compiler = functools.partial(CLCompiler, self, f"{hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()}")
-    super().__init__(device, CLAllocator(self), CompilerSet([CompilerPair(renderer, compiler)]), functools.partial(CLProgram, self))
+    self.clc = CLCompiler(self, f"{hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()}")
+    super().__init__(device, CLAllocator(self), CompilerSet([(renderer, None)]), functools.partial(CLProgram, self))
  def synchronize(self):
    check(cl.clFinish(self.queue))
    self.pending_copyin.clear()
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@ -2,7 +2,7 @@ from __future__ import annotations
 import platform, sys, ctypes, functools, time, mmap, threading, queue
 from tinygrad.helpers import to_mv, OSX, WIN, mv_address, wait_cond, suppress_finalizing, unwrap, data64_le
 from tinygrad.helpers import CPU_CC, CPU_LVP, CPU_LLVM
-from tinygrad.device import BufferSpec, DMACPURef, CompilerSet, CompilerPair
+from tinygrad.device import BufferSpec, DMACPURef, CompilerSet
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
 from tinygrad.runtime.support.hcq import CLikeArgsState
 from tinygrad.renderer.cstyle import ClangJITRenderer
@ -133,6 +133,5 @@ class CPUDevice(HCQCompiled):
  def __init__(self, device:str=""):
    self.tasks:queue.Queue = queue.Queue()
    CPUWorker(self, self.tasks, thread_id=0).start()
-    compilers = CompilerSet([CompilerPair(ClangJITRenderer, None), CompilerPair(CPULLVMRenderer, None, ctrl_var=CPU_LLVM),
-                             CompilerPair(LVPRenderer, None, ctrl_var=CPU_LVP)], ctrl_var=CPU_CC)
+    compilers = CompilerSet([(ClangJITRenderer, None), (CPULLVMRenderer, CPU_LLVM), (LVPRenderer, CPU_LVP)], ctrl_var=CPU_CC)
    super().__init__(device, CPUAllocator(self), compilers, functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue)
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@ -1,7 +1,7 @@
 from __future__ import annotations
 import ctypes, functools
-from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing, CUDA_CC, CUDA_PTX
-from tinygrad.device import Compiled, BufferSpec, LRUAllocator, CompilerPair, CompilerSet
+from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing, CUDA_CC, CUDA_PTX, CUDA_NVCC
+from tinygrad.device import Compiled, BufferSpec, LRUAllocator, CompilerSet
 from tinygrad.renderer.cstyle import CUDARenderer
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.runtime.autogen import cuda
@ -118,9 +118,9 @@ class CUDADevice(Compiled):
    CUDADevice.devices.append(self)

    from tinygrad.runtime.graph.cuda import CUDAGraph
-    compilers = CompilerSet([CompilerPair(functools.partial(CUDARenderer, self.arch, device="CUDA")),
-                             CompilerPair(functools.partial(PTXRenderer, self.arch, device="CUDA"), ctrl_var=CUDA_PTX),
-                             CompilerPair(functools.partial(CUDARenderer, self.arch, device="CUDA", use_nvcc=True), name="NVCC")], ctrl_var=CUDA_CC)
+    compilers = CompilerSet([(functools.partial(CUDARenderer, self.arch, device="CUDA"), None),
+                             (functools.partial(PTXRenderer, self.arch, device="CUDA"), CUDA_PTX),
+                             (functools.partial(CUDARenderer, self.arch, device="CUDA", use_nvcc=True), CUDA_NVCC)], ctrl_var=CUDA_CC)
    super().__init__(device, CUDAAllocator(self), compilers, functools.partial(CUDAProgram, self), None if MOCKGPU else CUDAGraph)

  def synchronize(self):
--- a/tinygrad/runtime/ops_dsp.py
+++ b/tinygrad/runtime/ops_dsp.py
@ -1,7 +1,7 @@
 from __future__ import annotations
 import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
 assert sys.platform != 'win32'
-from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, CompilerSet, CompilerPair
+from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, CompilerSet
 from tinygrad.dtype import dtypes, DType, PtrDType
 from tinygrad.uop.ops import Ops, UOp
 from tinygrad.helpers import getenv, round_up, mv_address, to_mv, cpu_objdump, system, DEBUG, suppress_finalizing
@ -146,10 +146,10 @@ class DSPCompiler(Compiler):

 class DSPDevice(Compiled):
  def __init__(self, device:str=""):
-    if getenv("MOCKDSP"): super().__init__(device, DSPAllocator(self), CompilerSet([CompilerPair(MockDSPRenderer)]), MockDSPProgram)
+    if getenv("MOCKDSP"): super().__init__(device, DSPAllocator(self), CompilerSet([(MockDSPRenderer, None)]), MockDSPProgram)
    else:
      self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
-      super().__init__(device, DSPAllocator(self), CompilerSet([CompilerPair(DSPRenderer)]), functools.partial(DSPProgram, self))
+      super().__init__(device, DSPAllocator(self), CompilerSet([(DSPRenderer, None)]), functools.partial(DSPProgram, self))
      fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
      self.shell_buf = self.allocator.alloc(round_up(fastrpc_shell.nbytes, 0x1000), BufferSpec(nolru=True))
      ctypes.memmove(self.shell_buf.va_addr, mv_address(fastrpc_shell), fastrpc_shell.nbytes)
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@ -1,6 +1,6 @@
 import ctypes, functools
 from tinygrad.helpers import mv_address, getenv, suppress_finalizing
-from tinygrad.device import Compiled, LRUAllocator, BufferSpec, CompilerSet, CompilerPair
+from tinygrad.device import Compiled, LRUAllocator, BufferSpec, CompilerSet
 from tinygrad.runtime.autogen import hip
 from tinygrad.renderer.cstyle import HIPRenderer
 from tinygrad.runtime.support.c import init_c_var, init_c_struct_t
@ -15,7 +15,7 @@ class HIPDevice(Compiled):
    self.arch = init_c_var(hip.hipDeviceProp_t, lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
    self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t, lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]

-    compilers = CompilerSet([CompilerPair(functools.partial(HIPRenderer, self.arch), None)])
+    compilers = CompilerSet([(functools.partial(HIPRenderer, self.arch), None)])
    super().__init__(device, HIPAllocator(self), compilers, functools.partial(HIPProgram, self))
  def synchronize(self):
    check(hip.hipSetDevice(self.device_id))
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@ -1,7 +1,7 @@
-import subprocess, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform
-from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap, suppress_finalizing
+import subprocess, pathlib, struct, ctypes, tempfile, functools, decimal, platform
+from tinygrad.helpers import prod, to_mv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap, suppress_finalizing
 import tinygrad.runtime.support.objc as objc
-from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent, CompilerSet, CompilerPair
+from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent, CompilerSet
 from tinygrad.renderer.cstyle import MetalRenderer
 from tinygrad.runtime.autogen import metal
 from tinygrad.runtime.support.c import DLL
@ -42,7 +42,7 @@ class MetalDevice(Compiled):
    from tinygrad.runtime.graph.metal import MetalGraph
    # NOTE: GitHub CI macOS runners use paravirtualized metal which is broken with graph.
    # This can be reproduced locally with any virtualization software (like utm) that can create macOS VMs with apple's own virtualization framework.
-    super().__init__(device, MetalAllocator(self), CompilerSet([CompilerPair(MetalRenderer, None)]),
+    super().__init__(device, MetalAllocator(self), CompilerSet([(MetalRenderer, None)]),
      functools.partial(MetalProgram, self), MetalGraph if 'virtual' not in from_ns_str(self.sysdevice.name()).lower() else None)

  def synchronize(self):
@ -54,20 +54,12 @@ class MetalDevice(Compiled):
        Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en)]
    self.mtl_buffers_in_flight.clear()

-def metal_src_to_library(device:MetalDevice, src:str) -> metal.MTLLibrary:
-  options = metal.MTLCompileOptions.new()
-  options.setFastMathEnabled(getenv("METAL_FAST_MATH"))
-  library = device.sysdevice.newLibraryWithSource_options_error(to_ns_str(src), options, ctypes.byref(compileError:=metal.NSError().retained()))
-  error_check(compileError, CompileError)
-  return library
-
 class MetalCompiler(Compiler):
  # Opening METAL after LLVM doesn't fail because ctypes.CDLL opens with RTLD_LOCAL but MTLCompiler opens it's own llvm with RTLD_GLOBAL
  # This means that MTLCompiler's llvm will create it's own instances of global state because RTLD_LOCAL doesn't export symbols, but if RTLD_GLOBAL
  # library is loaded first then RTLD_LOCAL library will just use it's symbols. On linux there is RTLD_DEEPBIND to prevent that, but on macos there
  # doesn't seem to be anything we can do.
-  with contextlib.suppress(FileNotFoundError, ModuleNotFoundError):
-    import tinygrad.runtime.autogen.llvm # noqa: F401
+  import tinygrad.runtime.autogen.llvm as _
  support = DLL("MTLCompiler", "MTLCompiler")
  support.MTLCodeGenServiceCreate.restype = ctypes.c_void_p

@ -118,15 +110,9 @@ class MetalCompiler(Compiler):
 class MetalProgram:
  def __init__(self, dev:MetalDevice, name:str, lib:bytes, **kwargs):
    self.dev, self.name, self.lib = dev, name, lib
-    if lib[:4] == b"MTLB":
-      # binary metal library
-      data = objc.dispatch_data_create(lib, len(lib), None, None)
-      self.library = self.dev.sysdevice.newLibraryWithData_error(data, ctypes.byref(error_lib:=metal.NSError().retained())).retained()
-      error_check(error_lib)
-    else:
-      # metal source. rely on OS caching
-      try: self.library = metal_src_to_library(self.dev, lib.decode())
-      except CompileError as e: raise RuntimeError from e
+    data = objc.dispatch_data_create(lib, len(lib), None, None)
+    self.library = self.dev.sysdevice.newLibraryWithData_error(data, ctypes.byref(error_lib:=metal.NSError().retained())).retained()
+    error_check(error_lib)
    self.fxn = self.library.newFunctionWithName(to_ns_str(name)).retained()
    descriptor = metal.MTLComputePipelineDescriptor.new()
    descriptor.setComputeFunction(self.fxn)
--- a/tinygrad/runtime/ops_null.py
+++ b/tinygrad/runtime/ops_null.py
@ -1,5 +1,5 @@
 import functools
-from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
+from tinygrad.device import Compiled, Allocator, CompilerSet
 from tinygrad.engine.jit import MultiGraphRunner
 from tinygrad.renderer.cstyle import Renderer, CStyleLanguage, AMDHIPRenderer
 from tinygrad.uop.ops import Ops
@ -39,6 +39,6 @@ class NullDevice(Compiled):
      case "AMD_CDNA4": renderer = functools.partial(AMDHIPRenderer, "gfx950")
      case "": renderer = NullRenderer
      case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
-    compilers = CompilerSet([CompilerPair(renderer, Compiler), CompilerPair(functools.partial(IR3Renderer, 0x6030001), None, NULL_IR3), # adreno 630
-                             CompilerPair(functools.partial(NAKRenderer, "sm_120", 48), None, NULL_NAK)]) # 5090
+    compilers = CompilerSet([(renderer, None), (functools.partial(IR3Renderer, 0x6030001), NULL_IR3), # adreno 630
+                             (functools.partial(NAKRenderer, "sm_120", 48), NULL_NAK)]) # 5090
    super().__init__(device, NullAllocator(self), compilers, functools.partial(NullProgram, device), NullGraph)
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@ -6,7 +6,7 @@ from dataclasses import dataclass
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
 from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
 from tinygrad.uop.ops import sint
-from tinygrad.device import Compiled, BufferSpec, CompilerPair, CompilerSet
+from tinygrad.device import Compiled, BufferSpec, CompilerSet
 from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE
 from tinygrad.helpers import ContextVar, VIZ, ProfileEvent
 from tinygrad.renderer.ptx import PTXRenderer
@ -619,9 +619,9 @@ class NVDevice(HCQCompiled[NVSignal]):
    self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
    self.sass_version = ((self.sm_version & 0xf00) >> 4) | (self.sm_version & 0xf)

-    compilers = CompilerSet(ctrl_var=NV_CC, cset=[CompilerPair(functools.partial(CUDARenderer, self.arch)),
-       CompilerPair(functools.partial(PTXRenderer, self.arch, device="NV"), ctrl_var=NV_PTX),
-       CompilerPair(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), ctrl_var=NV_NAK)])
+    compilers = CompilerSet(ctrl_var=NV_CC, cset=[(functools.partial(CUDARenderer, self.arch), None),
+       (functools.partial(PTXRenderer, self.arch, device="NV"), NV_PTX),
+       (functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK)])
    super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)

    self.pma_enabled = PMA.value > 0 and PROFILE >= 1
--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@ -6,7 +6,7 @@ from typing import Any, TYPE_CHECKING
 import pickle, base64, itertools, time, struct, sys, functools
 from tinygrad.dtype import DType, dtypes, ImageDType, PtrDType, truncate, float_to_fp16, float_to_bf16, float_to_fp8, fp8_to_float
 from tinygrad.helpers import all_same, getenv, flatten, get_single_element, EMULATE
-from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
+from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet
 from tinygrad.codegen.opt import tc
 from tinygrad.uop.ops import exec_alu, python_alu, Ops, UOp, GroupOp
 from tinygrad.renderer import Renderer
@ -248,4 +248,4 @@ class PythonAllocator(Allocator['PythonDevice']):

 class PythonDevice(Compiled):
  def __init__(self, device:str):
-    super().__init__(device, PythonAllocator(self), CompilerSet([CompilerPair(PythonRenderer, None)]), PythonProgram)
+    super().__init__(device, PythonAllocator(self), CompilerSet([(PythonRenderer, None)]), PythonProgram)
--- a/tinygrad/runtime/ops_qcom.py
+++ b/tinygrad/runtime/ops_qcom.py
@ -2,15 +2,15 @@ from __future__ import annotations
 import os, ctypes, functools, mmap, struct, array, math, sys, weakref, contextlib
 assert sys.platform != 'win32'
 from typing import Any
-from tinygrad.device import BufferSpec, CompilerSet, CompilerPair, Device
+from tinygrad.device import BufferSpec, CompilerSet, Device
 from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
 from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
 from tinygrad.runtime.autogen import kgsl, mesa
-from tinygrad.runtime.ops_cl import CLCompiler, CLDevice
+from tinygrad.runtime.ops_cl import CLDevice
 from tinygrad.renderer.cstyle import QCOMRenderer
 from tinygrad.renderer.nir import IR3Renderer
 from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, ceildiv, prod, fromimport, cpu_profile, lo32, suppress_finalizing
-from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE
+from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE, DEBUG
 from tinygrad.dtype import ImageDType, dtypes
 from tinygrad.runtime.support.system import System
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl  # noqa: F401  # pylint: disable=unused-import
@ -49,10 +49,6 @@ def pkt7_hdr(opcode: int, cnt: int): return mesa.CP_TYPE7_PKT | cnt & 0x3FFF | p
 def pkt4_hdr(reg: int, cnt: int): return mesa.CP_TYPE4_PKT | cnt & 0x7F | parity(cnt) << 7 | (reg & 0x3FFFF) << 8 | parity(reg) << 27

 def _read_lib(lib, off) -> int: return struct.unpack("I", lib[off:off+4])[0]
-class QCOMCompiler(CLCompiler):
-  def __init__(self, device:str=""): super().__init__(CLDevice(device), 'compile_qcom')
-  def disassemble(self, lib:bytes):
-    fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)])

 class QCOMSignal(HCQSignal):
  def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 19.2})
@ -231,7 +227,7 @@ class QCOMArgsState(HCQArgsState):
 class QCOMProgram(HCQProgram):
  def __init__(self, dev: QCOMDevice, name: str, lib: bytes, buf_dtypes=[], **kwargs):
    self.dev: QCOMDevice = dev
-    self.buf_dtypes, self.name, self.lib, self.NIR = buf_dtypes, name, lib, isinstance(dev.renderer, IR3Renderer)
+    self.buf_dtypes, self.name, self.NIR = buf_dtypes, name, isinstance(dev.renderer, IR3Renderer)

    if self.NIR:
      from tinygrad.runtime.support.compiler_mesa import IR3Compiler
@ -252,7 +248,9 @@ class QCOMProgram(HCQProgram):
      self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
      self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
      self.consts_info:list[tuple] = []
-    else: self._parse_lib()
+    else:
+      self._parse_lib(lib:=self.dev.cl_dev.clc.compile_cached(lib.decode()))
+      if DEBUG >= 7: fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)])

    self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
    to_mv(self.lib_gpu.va_addr, self.image_size)[:] = self.image
@ -274,21 +272,21 @@ class QCOMProgram(HCQProgram):
      raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
    return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

-  def _parse_lib(self):
+  def _parse_lib(self, lib):
    # Extract image binary
-    self.image_size = _read_lib(self.lib, 0x100)
-    self.image = bytearray(self.lib[(image_offset:=_read_lib(self.lib, 0xc0)):image_offset+self.image_size])
+    self.image_size = _read_lib(lib, 0x100)
+    self.image = bytearray(lib[(image_offset:=_read_lib(lib, 0xc0)):image_offset+self.image_size])

    # Parse image descriptors
-    image_desc_off = _read_lib(self.lib, 0x110)
-    self.prg_offset, self.brnchstck = _read_lib(self.lib, image_desc_off+0xc4), _read_lib(self.lib, image_desc_off+0x108) // 2
-    self.pvtmem, self.shmem = _read_lib(self.lib, image_desc_off+0xc8), _read_lib(self.lib, image_desc_off+0xd8)
+    image_desc_off = _read_lib(lib, 0x110)
+    self.prg_offset, self.brnchstck = _read_lib(lib, image_desc_off+0xc4), _read_lib(lib, image_desc_off+0x108) // 2
+    self.pvtmem, self.shmem = _read_lib(lib, image_desc_off+0xc8), _read_lib(lib, image_desc_off+0xd8)

    # Fill up constants and buffers info
    self.consts_info = []

    # Collect sampler info.
-    self.samp_cnt = samp_cnt_in_file = _read_lib(self.lib, image_desc_off + 0xdc)
+    self.samp_cnt = samp_cnt_in_file = _read_lib(lib, image_desc_off + 0xdc)
    assert self.samp_cnt <= 1, "Up to one sampler supported"
    if self.samp_cnt:
      self.samp_cnt += 1
@ -298,8 +296,8 @@ class QCOMProgram(HCQProgram):

    # Collect kernel arguments (buffers) info.
    bdoff, binfos = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samp_cnt_in_file, []
-    while bdoff + 32 <= len(self.lib):
-      length, _, _, offset_words, _, _, _, typ = struct.unpack("8I", self.lib[bdoff:bdoff+32])
+    while bdoff + 32 <= len(lib):
+      length, _, _, offset_words, _, _, _, typ = struct.unpack("8I", lib[bdoff:bdoff+32])
      if length == 0: break
      binfos.append((offset_words * 4, typ))
      bdoff += length
@ -309,16 +307,16 @@ class QCOMProgram(HCQProgram):
    self.tex_cnt, self.ibo_cnt = sum(typ is BUFTYPE_TEX for _,typ in binfos), sum(typ is BUFTYPE_IBO for _,typ in binfos)
    self.ibo_off, self.tex_off, self.samp_off = 2048, 2048 + 0x40 * self.ibo_cnt, 2048 + 0x40 * self.tex_cnt + 0x40 * self.ibo_cnt

-    if _read_lib(self.lib, 0xb0) != 0: # check if we have constants.
-      cdoff = _read_lib(self.lib, 0xac)
+    if _read_lib(lib, 0xb0) != 0: # check if we have constants.
+      cdoff = _read_lib(lib, 0xac)
      while cdoff + 40 <= image_offset:
-        cnst, offset_words, _, is32 = struct.unpack("I", self.lib[cdoff:cdoff+4])[0], *struct.unpack("III", self.lib[cdoff+16:cdoff+28])
+        cnst, offset_words, _, is32 = struct.unpack("I", lib[cdoff:cdoff+4])[0], *struct.unpack("III", lib[cdoff+16:cdoff+28])
        self.consts_info.append((cnst, offset_words * (sz_bytes:=(2 << is32)), sz_bytes))
        cdoff += 40

    # Registers info
-    reg_desc_off = _read_lib(self.lib, 0x34)
-    self.fregs, self.hregs = _read_lib(self.lib, reg_desc_off + 0x14), _read_lib(self.lib, reg_desc_off + 0x18)
+    reg_desc_off = _read_lib(lib, 0x34)
+    self.fregs, self.hregs = _read_lib(lib, reg_desc_off + 0x14), _read_lib(lib, reg_desc_off + 0x18)

 class QCOMTextureInfo:
  def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
@ -385,8 +383,8 @@ class QCOMDevice(HCQCompiled):
    if PROFILE and self.gpu_id[:2] < (7, 3):
      System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")

-    compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[CompilerPair(QCOMRenderer, functools.partial(QCOMCompiler, device)),
-                                                    CompilerPair(functools.partial(IR3Renderer, info.chip_id), None, QCOM_IR3)])
+    self.cl_dev = CLDevice(device)
+    compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[(QCOMRenderer, None), (functools.partial(IR3Renderer, info.chip_id), QCOM_IR3)])
    super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
                     functools.partial(QCOMComputeQueue, self), None)

--- a/tinygrad/runtime/ops_webgpu.py
+++ b/tinygrad/runtime/ops_webgpu.py
@ -1,5 +1,5 @@
 import functools, struct
-from tinygrad.device import  Compiled, Allocator, Compiler, BufferSpec, CompilerSet, CompilerPair
+from tinygrad.device import Compiled, Allocator, BufferSpec, CompilerSet
 from tinygrad.renderer.wgsl import WGSLRenderer
 from tinygrad.helpers import round_up, suppress_finalizing
 from tinygrad.runtime.autogen import webgpu
@ -217,7 +217,7 @@ class WebGpuDevice(Compiled):
    self.device_res = _run(webgpu.wgpuAdapterRequestDeviceF, webgpu.WGPURequestDeviceCallbackInfo, webgpu.WGPURequestDeviceCallback,
      webgpu.WGPURequestDeviceStatus, 1, 2, adapter_res, dev_desc)

-    super().__init__(device, WebGpuAllocator(self), CompilerSet([CompilerPair(WGSLRenderer, Compiler)]),
+    super().__init__(device, WebGpuAllocator(self), CompilerSet([(WGSLRenderer, None)]),
      functools.partial(WebGPUProgram, (self.device_res, webgpu.WGPUFeatureName_TimestampQuery in supported)))

  def synchronize(self):