pickle

more
fix
2026-06-24 02:14:17 +00:00 · 2025-12-23 19:34:04 -05:00 · 2025-12-23 19:22:39 -05:00 · 2025-12-23 18:28:13 -05:00 · 2025-12-23 18:15:58 -05:00
13 changed files with 95 additions and 31 deletions
--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@@ -334,7 +334,10 @@ class IntelRenderer(OpenCLRenderer):
 class MetalRenderer(CStyleLanguage):
  device = "METAL"
  shared_max = 32768
-  def __init__(self): self.tensor_cores = tc.metal if hasattr(os, 'uname') and os.uname().machine == "arm64" else []
+  def __init__(self):
+    self.tensor_cores = tc.metal if hasattr(os, 'uname') and os.uname().machine == "arm64" else []
+    from tinygrad.runtime.ops_metal import MetalCompiler
+    self.compiler = MetalCompiler()

  # language options
  kernel_typedef = "kernel void"
@@ -446,6 +449,18 @@ class CUDARenderer(CStyleLanguage):

    return super().render_kernel(function_name, kernel, bufs, uops, prefix=prefix)

+class CUDACUDARenderer(CUDARenderer):
+  def __init__(self, arch:str):
+    super().__init__(arch)
+    from tinygrad.runtime.support.compiler_cuda import CUDACompiler
+    self.compiler = CUDACompiler(arch)
+
+class CUDANVCCRenderer(CUDARenderer):
+  def __init__(self, arch:str):
+    super().__init__(arch)
+    from tinygrad.runtime.support.compiler_cuda import NVCCCompiler
+    self.compiler = NVCCCompiler(arch)
+
 class AMDRenderer(CStyleLanguage):
  device = "AMD"
  shared_max = 65536
@@ -538,6 +553,32 @@ class AMDRenderer(CStyleLanguage):
  for (int n = 0; n < 8; n++) { d[n] = c_frag[n*2]; } return d;\n}""")
    return super().render_kernel(function_name, kernel, bufs, uops, prefix)

+class AMDHIPRenderer(AMDRenderer):
+  def __init__(self, arch:str):
+    super().__init__(arch)
+    from tinygrad.runtime.support.compiler_amd import HIPCompiler
+    self.compiler = HIPCompiler(arch)
+
+class AMDHIPCCRenderer(AMDRenderer):
+  def __init__(self, arch:str):
+    super().__init__(arch)
+    from tinygrad.runtime.support.compiler_amd import HIPCCCompiler
+    self.compiler = HIPCCCompiler(arch)
+
 class NVRenderer(CUDARenderer): device = "NV"
+
+class NVNVRenderer(NVRenderer):
+  def __init__(self, arch:str):
+    super().__init__(arch)
+    from tinygrad.runtime.support.compiler_cuda import NVCompiler
+    self.compiler = NVCompiler(arch)
+
 class HIPRenderer(AMDRenderer): device = "HIP"
+
+class HIPHIPRenderer(HIPRenderer):
+  def __init__(self, arch:str):
+    super().__init__(arch)
+    from tinygrad.runtime.support.compiler_amd import HIPCompiler
+    self.compiler = HIPCompiler(arch)
+
 class QCOMRenderer(OpenCLRenderer): device = "QCOM"
--- a/tinygrad/renderer/llvmir.py
+++ b/tinygrad/renderer/llvmir.py
@@ -143,6 +143,9 @@ class LLVMRenderer(Renderer):
  if AMX: tensor_cores = tc.amx

  extra_matcher = create_non_native_float_pats((dtypes.bfloat16,)) + pm_manual_bf16_cast
+  def __init__(self):
+    from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler
+    self.compiler = CPULLVMCompiler()
  def render(self, uops: list[UOp]) -> str: return "\n".join((k:=self._render_kernel(uops))[0] + (k[1], self._render_footer(uops)))
  def _render_footer(self, uops: list[UOp]) -> str: return 'attributes #0 = { alwaysinline nounwind "no-builtins" "no-trapping-math"="true" }'
  def _render_fn(self, name:str, args:list[tuple[str,DType]], kernel:list[str], prefix:list[str]|None=None) -> str:
@@ -254,7 +257,9 @@ exit: %packed = phi i32 [%packed_bf8, %do_bf8], [%packed_fp8, %do_fp8]\n  %trunc
                  f'"amdgpu-flat-work-group-size"="1,{requiredMaxThreadsPerBlock}"', '"no-trapping-math"="true"']
    return 'attributes #0 = { ' + ' '.join(attributes) + ' }'
  def __init__(self, arch:str):
+    from tinygrad.runtime.support.compiler_amd import AMDLLVMCompiler
    self.arch = arch
+    self.compiler = AMDLLVMCompiler(arch)
    self.tensor_cores = AMDRenderer.get_tensor_cores(arch)
    self.is_cdna = AMDRenderer.is_cdna(arch)
    self.string_rewrite += PatternMatcher([(UPat(Ops.WMMA, name="wmma"), lambda ctx, wmma, cdna=self.is_cdna: render_wmma_amd(ctx, wmma, cdna))])
--- a/tinygrad/renderer/nir.py
+++ b/tinygrad/renderer/nir.py
@@ -245,6 +245,11 @@ class LVPRenderer(NIRRenderer):
    srcs=lambda b, self: [nsrc(nimm(b, 0, dtypes.int)), nsrc(nimm(b, self.param_idx, dtypes.int))], also=lambda self, sz:
    setattr(self, "param_idx", self.param_idx+sz))(lambda self,b,x,sz: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_load_ubo))

+  def __init__(self):
+    from tinygrad.runtime.support.compiler_mesa import LVPCompiler
+    super().__init__()
+    self.compiler = LVPCompiler()
+
  def prerender(self, uops:list[UOp]):
    super().prerender(uops)
    self.param_sz = sum([8 if u.op == Ops.DEFINE_GLOBAL else u.dtype.itemsize for u in uops if u.op in (Ops.DEFINE_GLOBAL, Ops.DEFINE_VAR)])
--- a/tinygrad/renderer/ptx.py
+++ b/tinygrad/renderer/ptx.py
@@ -240,3 +240,17 @@ class PTXRenderer(Renderer):

      if u.op is Ops.SPECIAL: kernel = [f".reg .u32 %{u.arg};"] + kernel
    return self.render_kernel(kernel, name, bufs, c.items(), uops)
+
+class CUDAPTXRenderer(PTXRenderer):
+  def __init__(self, arch:str):
+    super().__init__(arch, "CUDA")
+    from tinygrad.runtime.support.compiler_cuda import PTXCompiler
+    self.compiler = PTXCompiler(arch)
+  def __reduce__(self): return self.__class__, (self.arch,)
+
+class NVPTXRenderer(PTXRenderer):
+  def __init__(self, arch:str):
+    super().__init__(arch, "NV")
+    from tinygrad.runtime.support.compiler_cuda import NVPTXCompiler
+    self.compiler = NVPTXCompiler(arch)
+  def __reduce__(self): return self.__class__, (self.arch,)
--- a/tinygrad/renderer/wgsl.py
+++ b/tinygrad/renderer/wgsl.py
@@ -1,6 +1,7 @@
 from tinygrad.dtype import DType, PtrDType, dtypes, AddrSpace
 from tinygrad.uop.ops import UOp, Ops, PatternMatcher, UPat
 from tinygrad.renderer.cstyle import CStyleLanguage, base_rewrite, extra_pm
+from tinygrad.device import Compiler
 from tinygrad.helpers import strip_parens

 def sign_extend(val:UOp, sext_am:int):
@@ -46,6 +47,7 @@ class WGSLRenderer(CStyleLanguage):
  global_max = (65535, 65535, 65535)
  local_max = (256, 256, 64)
  code_for_workitem = {"g": lambda x: f"i32(gindex.{'xyz'[int(x)]})", "l": lambda x: f"i32(lindex.{'xyz'[int(x)]})"}
+  def __init__(self): self.compiler = Compiler()
  extra_matcher = wgsl_matcher
  supports_float4 = False
  barrier = "workgroupBarrier();"
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@@ -9,11 +9,10 @@ from tinygrad.uop.ops import sint
 from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet, CompilerPair
 from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, lo32, hi32, colored, prod, ContextVar
 from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv
-from tinygrad.renderer.cstyle import AMDRenderer
+from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer
 from tinygrad.renderer.llvmir import AMDLLVMRenderer
 from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt
 from tinygrad.runtime.autogen.am import am
-from tinygrad.runtime.support.compiler_amd import HIPCompiler, HIPCCCompiler, AMDLLVMCompiler
 from tinygrad.runtime.support.elf import elf_loader
 from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager
 from tinygrad.runtime.support.amd import AMDReg, AMDIP, import_module, import_soc, import_ip_offsets, import_pmc
@@ -931,9 +930,9 @@ class AMDDevice(HCQCompiled):
    max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
    self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20))

-    compilers = CompilerSet([CompilerPair(functools.partial(AMDRenderer, self.arch), functools.partial(HIPCompiler, self.arch)),
-                             CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), functools.partial(AMDLLVMCompiler, self.arch), AMD_LLVM),
-                             CompilerPair(functools.partial(AMDRenderer, self.arch), functools.partial(HIPCCCompiler, self.arch))], ctrl_var=AMD_CC)
+    compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None),
+                             CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), None, AMD_LLVM),
+                             CompilerPair(functools.partial(AMDHIPCCRenderer, self.arch), None)], ctrl_var=AMD_CC)

    super().__init__(device, AMDAllocator(self), compilers, functools.partial(AMDProgram, self), AMDSignal,
                     functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),
--- a/tinygrad/runtime/ops_cpu.py
+++ b/tinygrad/runtime/ops_cpu.py
@@ -8,7 +8,6 @@ from tinygrad.runtime.support.hcq import CLikeArgsState
 from tinygrad.renderer.cstyle import ClangJITRenderer
 from tinygrad.renderer.llvmir import LLVMRenderer
 from tinygrad.renderer.nir import LVPRenderer
-from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler
 from tinygrad.runtime.support.compiler_mesa import LVPCompiler
 from tinygrad.runtime.support.elf import jit_loader
 from tinygrad.uop.ops import sint
@@ -136,6 +135,6 @@ class CPUDevice(HCQCompiled):
  def __init__(self, device:str=""):
    self.tasks:queue.Queue = queue.Queue()
    CPUWorker(self, self.tasks, thread_id=0).start()
-    compilers = CompilerSet([CompilerPair(ClangJITRenderer, None), CompilerPair(LLVMRenderer, CPULLVMCompiler, ctrl_var=CPU_LLVM),
-                             CompilerPair(LVPRenderer, LVPCompiler, ctrl_var=CPU_LVP)], ctrl_var=CPU_CC)
+    compilers = CompilerSet([CompilerPair(ClangJITRenderer, None), CompilerPair(LLVMRenderer, None, ctrl_var=CPU_LLVM),
+                             CompilerPair(LVPRenderer, None, ctrl_var=CPU_LVP)], ctrl_var=CPU_CC)
    super().__init__(device, CPUAllocator(self), compilers, functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue)
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@@ -2,10 +2,10 @@ from __future__ import annotations
 import ctypes, functools
 from tinygrad.helpers import DEBUG, getenv, mv_address, init_c_var, init_c_struct_t, suppress_finalizing, CUDA_CC, CUDA_PTX
 from tinygrad.device import Compiled, BufferSpec, LRUAllocator, CompilerPair, CompilerSet
-from tinygrad.renderer.cstyle import CUDARenderer
-from tinygrad.renderer.ptx import PTXRenderer
+from tinygrad.renderer.cstyle import CUDACUDARenderer, CUDANVCCRenderer
+from tinygrad.renderer.ptx import CUDAPTXRenderer
 from tinygrad.runtime.autogen import cuda
-from tinygrad.runtime.support.compiler_cuda import pretty_ptx, CUDACompiler, PTXCompiler, NVCCCompiler
+from tinygrad.runtime.support.compiler_cuda import pretty_ptx
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl  # noqa: F401  # pylint: disable=unused-import
 if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported

@@ -117,9 +117,9 @@ class CUDADevice(Compiled):
    CUDADevice.devices.append(self)

    from tinygrad.runtime.graph.cuda import CUDAGraph
-    compilers = CompilerSet([CompilerPair(functools.partial(CUDARenderer, self.arch), functools.partial(CUDACompiler, self.arch)),
-                             CompilerPair(functools.partial(PTXRenderer, self.arch), functools.partial(PTXCompiler, self.arch), CUDA_PTX),
-                             CompilerPair(functools.partial(CUDARenderer, self.arch), functools.partial(NVCCCompiler, self.arch))], ctrl_var=CUDA_CC)
+    compilers = CompilerSet([CompilerPair(functools.partial(CUDACUDARenderer, self.arch), None),
+                             CompilerPair(functools.partial(CUDAPTXRenderer, self.arch), None, CUDA_PTX),
+                             CompilerPair(functools.partial(CUDANVCCRenderer, self.arch), None)], ctrl_var=CUDA_CC)
    super().__init__(device, CUDAAllocator(self), compilers, functools.partial(CUDAProgram, self), None if MOCKGPU else CUDAGraph)

  def synchronize(self):
--- a/tinygrad/runtime/ops_hip.py
+++ b/tinygrad/runtime/ops_hip.py
@@ -2,8 +2,7 @@ import ctypes, functools
 from tinygrad.helpers import init_c_var, mv_address, init_c_struct_t, getenv
 from tinygrad.device import Compiled, LRUAllocator, BufferSpec, CompilerSet, CompilerPair
 from tinygrad.runtime.autogen import hip
-from tinygrad.runtime.support.compiler_amd import HIPCompiler
-from tinygrad.renderer.cstyle import HIPRenderer
+from tinygrad.renderer.cstyle import HIPHIPRenderer
 if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl  # noqa: F401 # pylint: disable=unused-import

 def check(status):
@@ -15,7 +14,7 @@ class HIPDevice(Compiled):
    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
    self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]

-    compilers = CompilerSet([CompilerPair(functools.partial(HIPRenderer, self.arch), functools.partial(HIPCompiler, self.arch))])
+    compilers = CompilerSet([CompilerPair(functools.partial(HIPHIPRenderer, self.arch), None)])
    super().__init__(device, HIPAllocator(self), compilers, functools.partial(HIPProgram, self))
  def synchronize(self):
    check(hip.hipSetDevice(self.device_id))
--- a/tinygrad/runtime/ops_metal.py
+++ b/tinygrad/runtime/ops_metal.py
@@ -44,7 +44,7 @@ class MetalDevice(Compiled):
    from tinygrad.runtime.graph.metal import MetalGraph
    # NOTE: GitHub CI macOS runners use paravirtualized metal which is broken with graph.
    # This can be reproduced locally with any virtualization software (like utm) that can create macOS VMs with apple's own virtualization framework.
-    super().__init__(device, MetalAllocator(self), CompilerSet([CompilerPair(MetalRenderer, MetalCompiler), CompilerPair(MetalRenderer, Compiler)]),
+    super().__init__(device, MetalAllocator(self), CompilerSet([CompilerPair(MetalRenderer, None)]),
      functools.partial(MetalProgram, self), MetalGraph if 'virtual' not in from_ns_str(self.sysdevice.name()).lower() else None)

  def synchronize(self):
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@@ -8,9 +8,8 @@ from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU
 from tinygrad.uop.ops import sint
 from tinygrad.device import BufferSpec, CompilerPair, CompilerSet
 from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK
-from tinygrad.renderer.ptx import PTXRenderer
-from tinygrad.renderer.cstyle import NVRenderer
-from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, NVPTXCompiler, NVCompiler
+from tinygrad.renderer.ptx import CUDAPTXRenderer, NVPTXRenderer
+from tinygrad.renderer.cstyle import NVNVRenderer, CUDACUDARenderer
 from tinygrad.runtime.support.compiler_mesa import NAKCompiler
 from tinygrad.runtime.autogen import nv_570, nv_580, pci, mesa
 from tinygrad.runtime.support.elf import elf_loader
@@ -583,9 +582,9 @@ class NVDevice(HCQCompiled[HCQSignal]):
    self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
    self.sass_version = ((self.sm_version & 0xf00) >> 4) | (self.sm_version & 0xf)

-    cucc, ptxcc = (CUDACompiler, PTXCompiler) if MOCKGPU else (NVCompiler, NVPTXCompiler)
-    compilers = CompilerSet(ctrl_var=NV_CC, cset=[CompilerPair(functools.partial(NVRenderer, self.arch),functools.partial(cucc, self.arch)),
-       CompilerPair(functools.partial(PTXRenderer, self.arch, device="NV"), functools.partial(ptxcc, self.arch), NV_PTX),
+    nvr, ptxr = (CUDACUDARenderer, CUDAPTXRenderer) if MOCKGPU else (NVNVRenderer, NVPTXRenderer)
+    compilers = CompilerSet(ctrl_var=NV_CC, cset=[CompilerPair(functools.partial(nvr, self.arch), None),
+       CompilerPair(functools.partial(ptxr, self.arch), None, NV_PTX),
       CompilerPair(functools.partial(NAKRenderer, dev=self), functools.partial(NAKCompiler, self.arch, self.max_warps_per_sm), NV_NAK)])
    super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), HCQSignal, NVComputeQueue, NVCopyQueue)

--- a/tinygrad/runtime/ops_python.py
+++ b/tinygrad/runtime/ops_python.py
@@ -213,10 +213,14 @@ class PythonProgram:
        i += 1
    return time.perf_counter() - st

+class PythonCompiler(Compiler):
+  def compile(self, src:str) -> bytes: return base64.b64decode(src)
+
 class PythonRenderer(Renderer):
  device = "PYTHON"
  code_for_op = python_alu
  def __init__(self):
+    self.compiler = PythonCompiler()
    match cast(str, EMULATE.value):
      case "METAL": self.device, self.tensor_cores = "METAL", tc.metal
      case "AMD": self.device, self.tensor_cores = "AMD", tc.amd_rdna3
@@ -235,9 +239,6 @@ class PythonRenderer(Renderer):
    lops = [(u.op, u.dtype, [uops.index(v) for v in u.src if u.op is not Ops.SPECIAL], u.arg) for u in uops]
    return base64.b64encode(pickle.dumps(lops)).decode()

-class PythonCompiler(Compiler):
-  def compile(self, src:str) -> bytes: return base64.b64decode(src)
-
 class PythonAllocator(Allocator['PythonDevice']):
  def _alloc(self, size, options): return memoryview(bytearray(size))
  def _copyin(self, dest, src:memoryview): dest[:] = src
@@ -245,4 +246,4 @@ class PythonAllocator(Allocator['PythonDevice']):

 class PythonDevice(Compiled):
  def __init__(self, device:str):
-    super().__init__(device, PythonAllocator(self), CompilerSet([CompilerPair(PythonRenderer, PythonCompiler)]), PythonProgram)
+    super().__init__(device, PythonAllocator(self), CompilerSet([CompilerPair(PythonRenderer, None)]), PythonProgram)
--- a/tinygrad/runtime/ops_webgpu.py
+++ b/tinygrad/runtime/ops_webgpu.py
@@ -1,5 +1,5 @@
 import functools, struct
-from tinygrad.device import  Compiled, Allocator, Compiler, BufferSpec, CompilerSet, CompilerPair
+from tinygrad.device import Compiled, Allocator, BufferSpec, CompilerSet, CompilerPair
 from tinygrad.renderer.wgsl import WGSLRenderer
 from tinygrad.helpers import round_up, suppress_finalizing
 from tinygrad.runtime.autogen import webgpu
@@ -215,7 +215,7 @@ class WebGpuDevice(Compiled):
    device_res = _run(webgpu.wgpuAdapterRequestDeviceF, webgpu.WGPURequestDeviceCallbackInfo, webgpu.WGPURequestDeviceCallback,
      webgpu.WGPURequestDeviceStatus, 1, 2, adapter_res, dev_desc)

-    super().__init__(device, WebGpuAllocator(device_res), CompilerSet([CompilerPair(WGSLRenderer, Compiler)]),
+    super().__init__(device, WebGpuAllocator(device_res), CompilerSet([CompilerPair(WGSLRenderer, None)]),
      functools.partial(WebGPUProgram, (device_res, webgpu.WGPUFeatureName_TimestampQuery in supported)))

  def synchronize(self):
Author	SHA1	Message	Date
George Hotz	63447d50ef	pickle	2025-12-23 19:34:04 -05:00
George Hotz	2621e57c53	more	2025-12-23 19:22:39 -05:00
George Hotz	8c05401d5d	fix	2025-12-23 18:28:13 -05:00
George Hotz	7b0ce86e2a	more early compilers	2025-12-23 18:15:58 -05:00