Compare commits

...

4 commits

Author SHA1 Message Date
George Hotz
63447d50ef pickle 2025-12-23 19:34:04 -05:00
George Hotz
2621e57c53 more 2025-12-23 19:22:39 -05:00
George Hotz
8c05401d5d fix 2025-12-23 18:28:13 -05:00
George Hotz
7b0ce86e2a more early compilers 2025-12-23 18:15:58 -05:00
13 changed files with 95 additions and 31 deletions

View file

@ -334,7 +334,10 @@ class IntelRenderer(OpenCLRenderer):
class MetalRenderer(CStyleLanguage):
device = "METAL"
shared_max = 32768
def __init__(self): self.tensor_cores = tc.metal if hasattr(os, 'uname') and os.uname().machine == "arm64" else []
def __init__(self):
  """Pick the tensor-core list for this host and attach a MetalCompiler instance."""
  # os.uname is not available on every platform, so probe for it before calling it
  on_arm64 = hasattr(os, 'uname') and os.uname().machine == "arm64"
  self.tensor_cores = tc.metal if on_arm64 else []
  # NOTE(review): imported at construction time rather than at module level -- presumably to avoid an import cycle with ops_metal; confirm
  from tinygrad.runtime.ops_metal import MetalCompiler
  self.compiler = MetalCompiler()
# language options
kernel_typedef = "kernel void"
@ -446,6 +449,18 @@ class CUDARenderer(CStyleLanguage):
return super().render_kernel(function_name, kernel, bufs, uops, prefix=prefix)
class CUDACUDARenderer(CUDARenderer):
  """CUDARenderer that also owns a CUDACompiler built for the same arch."""
  def __init__(self, arch:str):
    super().__init__(arch)
    # the compiler module is imported only when a renderer is actually constructed
    from tinygrad.runtime.support.compiler_cuda import CUDACompiler as compiler_cls
    self.compiler = compiler_cls(arch)
class CUDANVCCRenderer(CUDARenderer):
  """CUDARenderer that also owns an NVCCCompiler built for the same arch."""
  def __init__(self, arch:str):
    super().__init__(arch)
    # the compiler module is imported only when a renderer is actually constructed
    from tinygrad.runtime.support.compiler_cuda import NVCCCompiler as compiler_cls
    self.compiler = compiler_cls(arch)
class AMDRenderer(CStyleLanguage):
device = "AMD"
shared_max = 65536
@ -538,6 +553,32 @@ class AMDRenderer(CStyleLanguage):
for (int n = 0; n < 8; n++) { d[n] = c_frag[n*2]; } return d;\n}""")
return super().render_kernel(function_name, kernel, bufs, uops, prefix)
class AMDHIPRenderer(AMDRenderer):
  """AMDRenderer that also owns a HIPCompiler built for the same arch."""
  def __init__(self, arch:str):
    super().__init__(arch)
    # the compiler module is imported only when a renderer is actually constructed
    from tinygrad.runtime.support.compiler_amd import HIPCompiler as compiler_cls
    self.compiler = compiler_cls(arch)
class AMDHIPCCRenderer(AMDRenderer):
  """AMDRenderer that also owns a HIPCCCompiler built for the same arch."""
  def __init__(self, arch:str):
    super().__init__(arch)
    # the compiler module is imported only when a renderer is actually constructed
    from tinygrad.runtime.support.compiler_amd import HIPCCCompiler as compiler_cls
    self.compiler = compiler_cls(arch)
# CUDARenderer variant whose only visible difference is the device string "NV".
class NVRenderer(CUDARenderer): device = "NV"
class NVNVRenderer(NVRenderer):
  """NVRenderer that also owns an NVCompiler built for the same arch."""
  def __init__(self, arch:str):
    super().__init__(arch)
    # the compiler module is imported only when a renderer is actually constructed
    from tinygrad.runtime.support.compiler_cuda import NVCompiler as compiler_cls
    self.compiler = compiler_cls(arch)
# AMDRenderer variant whose only visible difference is the device string "HIP".
class HIPRenderer(AMDRenderer): device = "HIP"
class HIPHIPRenderer(HIPRenderer):
  """HIPRenderer that also owns a HIPCompiler built for the same arch."""
  def __init__(self, arch:str):
    super().__init__(arch)
    # the compiler module is imported only when a renderer is actually constructed
    from tinygrad.runtime.support.compiler_amd import HIPCompiler as compiler_cls
    self.compiler = compiler_cls(arch)
# OpenCLRenderer variant whose only visible difference is the device string "QCOM".
class QCOMRenderer(OpenCLRenderer): device = "QCOM"

View file

@ -143,6 +143,9 @@ class LLVMRenderer(Renderer):
if AMX: tensor_cores = tc.amx
extra_matcher = create_non_native_float_pats((dtypes.bfloat16,)) + pm_manual_bf16_cast
def __init__(self):
  """Attach a CPULLVMCompiler instance to this renderer."""
  # the compiler module is imported only when a renderer is actually constructed
  from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler as compiler_cls
  self.compiler = compiler_cls()
def render(self, uops: list[UOp]) -> str: return "\n".join((k:=self._render_kernel(uops))[0] + (k[1], self._render_footer(uops)))
def _render_footer(self, uops: list[UOp]) -> str: return 'attributes #0 = { alwaysinline nounwind "no-builtins" "no-trapping-math"="true" }'
def _render_fn(self, name:str, args:list[tuple[str,DType]], kernel:list[str], prefix:list[str]|None=None) -> str:
@ -254,7 +257,9 @@ exit: %packed = phi i32 [%packed_bf8, %do_bf8], [%packed_fp8, %do_fp8]\n %trunc
f'"amdgpu-flat-work-group-size"="1,{requiredMaxThreadsPerBlock}"', '"no-trapping-math"="true"']
return 'attributes #0 = { ' + ' '.join(attributes) + ' }'
def __init__(self, arch:str):
from tinygrad.runtime.support.compiler_amd import AMDLLVMCompiler
self.arch = arch
self.compiler = AMDLLVMCompiler(arch)
self.tensor_cores = AMDRenderer.get_tensor_cores(arch)
self.is_cdna = AMDRenderer.is_cdna(arch)
self.string_rewrite += PatternMatcher([(UPat(Ops.WMMA, name="wmma"), lambda ctx, wmma, cdna=self.is_cdna: render_wmma_amd(ctx, wmma, cdna))])

View file

@ -245,6 +245,11 @@ class LVPRenderer(NIRRenderer):
srcs=lambda b, self: [nsrc(nimm(b, 0, dtypes.int)), nsrc(nimm(b, self.param_idx, dtypes.int))], also=lambda self, sz:
setattr(self, "param_idx", self.param_idx+sz))(lambda self,b,x,sz: mesa.nir_intrinsic_instr_create(b.shader, mesa.nir_intrinsic_load_ubo))
def __init__(self):
  """Run the parent initializer, then attach an LVPCompiler instance."""
  # the import stays ahead of super().__init__() so a missing compiler module fails before any parent setup runs
  from tinygrad.runtime.support.compiler_mesa import LVPCompiler as compiler_cls
  super().__init__()
  self.compiler = compiler_cls()
def prerender(self, uops:list[UOp]):
super().prerender(uops)
self.param_sz = sum([8 if u.op == Ops.DEFINE_GLOBAL else u.dtype.itemsize for u in uops if u.op in (Ops.DEFINE_GLOBAL, Ops.DEFINE_VAR)])

View file

@ -240,3 +240,17 @@ class PTXRenderer(Renderer):
if u.op is Ops.SPECIAL: kernel = [f".reg .u32 %{u.arg};"] + kernel
return self.render_kernel(kernel, name, bufs, c.items(), uops)
class CUDAPTXRenderer(PTXRenderer):
  """PTXRenderer fixed to the "CUDA" device, with a PTXCompiler built for the same arch."""
  def __init__(self, arch:str):
    super().__init__(arch, "CUDA")
    # the compiler module is imported only when a renderer is actually constructed
    from tinygrad.runtime.support.compiler_cuda import PTXCompiler as compiler_cls
    self.compiler = compiler_cls(arch)

  def __reduce__(self):
    # pickle hook: rebuild from arch alone, matching this class's one-argument __init__
    # NOTE(review): self.arch is presumably set by PTXRenderer.__init__ -- confirm
    rebuild_args = (self.arch,)
    return self.__class__, rebuild_args
class NVPTXRenderer(PTXRenderer):
  """PTXRenderer fixed to the "NV" device, with an NVPTXCompiler built for the same arch."""
  def __init__(self, arch:str):
    super().__init__(arch, "NV")
    # the compiler module is imported only when a renderer is actually constructed
    from tinygrad.runtime.support.compiler_cuda import NVPTXCompiler as compiler_cls
    self.compiler = compiler_cls(arch)

  def __reduce__(self):
    # pickle hook: rebuild from arch alone, matching this class's one-argument __init__
    # NOTE(review): self.arch is presumably set by PTXRenderer.__init__ -- confirm
    rebuild_args = (self.arch,)
    return self.__class__, rebuild_args

View file

@ -1,6 +1,7 @@
from tinygrad.dtype import DType, PtrDType, dtypes, AddrSpace
from tinygrad.uop.ops import UOp, Ops, PatternMatcher, UPat
from tinygrad.renderer.cstyle import CStyleLanguage, base_rewrite, extra_pm
from tinygrad.device import Compiler
from tinygrad.helpers import strip_parens
def sign_extend(val:UOp, sext_am:int):
@ -46,6 +47,7 @@ class WGSLRenderer(CStyleLanguage):
global_max = (65535, 65535, 65535)
local_max = (256, 256, 64)
code_for_workitem = {"g": lambda x: f"i32(gindex.{'xyz'[int(x)]})", "l": lambda x: f"i32(lindex.{'xyz'[int(x)]})"}
def __init__(self):
  """Attach an instance of the base Compiler class to this renderer."""
  # NOTE(review): the plain base Compiler is used here, presumably because WGSL needs no separate compile step -- confirm
  self.compiler = Compiler()
extra_matcher = wgsl_matcher
supports_float4 = False
barrier = "workgroupBarrier();"

View file

@ -9,11 +9,10 @@ from tinygrad.uop.ops import sint
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet, CompilerPair
from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, lo32, hi32, colored, prod, ContextVar
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv
from tinygrad.renderer.cstyle import AMDRenderer
from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer
from tinygrad.renderer.llvmir import AMDLLVMRenderer
from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt
from tinygrad.runtime.autogen.am import am
from tinygrad.runtime.support.compiler_amd import HIPCompiler, HIPCCCompiler, AMDLLVMCompiler
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.support.am.amdev import AMDev, AMMemoryManager
from tinygrad.runtime.support.amd import AMDReg, AMDIP, import_module, import_soc, import_ip_offsets, import_pmc
@ -931,9 +930,9 @@ class AMDDevice(HCQCompiled):
max_copy_size = 0x40000000 if self.iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x200 if self.is_usb() else (16 << 20))
compilers = CompilerSet([CompilerPair(functools.partial(AMDRenderer, self.arch), functools.partial(HIPCompiler, self.arch)),
CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), functools.partial(AMDLLVMCompiler, self.arch), AMD_LLVM),
CompilerPair(functools.partial(AMDRenderer, self.arch), functools.partial(HIPCCCompiler, self.arch))], ctrl_var=AMD_CC)
compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None),
CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), None, AMD_LLVM),
CompilerPair(functools.partial(AMDHIPCCRenderer, self.arch), None)], ctrl_var=AMD_CC)
super().__init__(device, AMDAllocator(self), compilers, functools.partial(AMDProgram, self), AMDSignal,
functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),

View file

@ -8,7 +8,6 @@ from tinygrad.runtime.support.hcq import CLikeArgsState
from tinygrad.renderer.cstyle import ClangJITRenderer
from tinygrad.renderer.llvmir import LLVMRenderer
from tinygrad.renderer.nir import LVPRenderer
from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler
from tinygrad.runtime.support.compiler_mesa import LVPCompiler
from tinygrad.runtime.support.elf import jit_loader
from tinygrad.uop.ops import sint
@ -136,6 +135,6 @@ class CPUDevice(HCQCompiled):
def __init__(self, device:str=""):
self.tasks:queue.Queue = queue.Queue()
CPUWorker(self, self.tasks, thread_id=0).start()
compilers = CompilerSet([CompilerPair(ClangJITRenderer, None), CompilerPair(LLVMRenderer, CPULLVMCompiler, ctrl_var=CPU_LLVM),
CompilerPair(LVPRenderer, LVPCompiler, ctrl_var=CPU_LVP)], ctrl_var=CPU_CC)
compilers = CompilerSet([CompilerPair(ClangJITRenderer, None), CompilerPair(LLVMRenderer, None, ctrl_var=CPU_LLVM),
CompilerPair(LVPRenderer, None, ctrl_var=CPU_LVP)], ctrl_var=CPU_CC)
super().__init__(device, CPUAllocator(self), compilers, functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue)

View file

@ -2,10 +2,10 @@ from __future__ import annotations
import ctypes, functools
from tinygrad.helpers import DEBUG, getenv, mv_address, init_c_var, init_c_struct_t, suppress_finalizing, CUDA_CC, CUDA_PTX
from tinygrad.device import Compiled, BufferSpec, LRUAllocator, CompilerPair, CompilerSet
from tinygrad.renderer.cstyle import CUDARenderer
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.renderer.cstyle import CUDACUDARenderer, CUDANVCCRenderer
from tinygrad.renderer.ptx import CUDAPTXRenderer
from tinygrad.runtime.autogen import cuda
from tinygrad.runtime.support.compiler_cuda import pretty_ptx, CUDACompiler, PTXCompiler, NVCCCompiler
from tinygrad.runtime.support.compiler_cuda import pretty_ptx
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported
@ -117,9 +117,9 @@ class CUDADevice(Compiled):
CUDADevice.devices.append(self)
from tinygrad.runtime.graph.cuda import CUDAGraph
compilers = CompilerSet([CompilerPair(functools.partial(CUDARenderer, self.arch), functools.partial(CUDACompiler, self.arch)),
CompilerPair(functools.partial(PTXRenderer, self.arch), functools.partial(PTXCompiler, self.arch), CUDA_PTX),
CompilerPair(functools.partial(CUDARenderer, self.arch), functools.partial(NVCCCompiler, self.arch))], ctrl_var=CUDA_CC)
compilers = CompilerSet([CompilerPair(functools.partial(CUDACUDARenderer, self.arch), None),
CompilerPair(functools.partial(CUDAPTXRenderer, self.arch), None, CUDA_PTX),
CompilerPair(functools.partial(CUDANVCCRenderer, self.arch), None)], ctrl_var=CUDA_CC)
super().__init__(device, CUDAAllocator(self), compilers, functools.partial(CUDAProgram, self), None if MOCKGPU else CUDAGraph)
def synchronize(self):

View file

@ -2,8 +2,7 @@ import ctypes, functools
from tinygrad.helpers import init_c_var, mv_address, init_c_struct_t, getenv
from tinygrad.device import Compiled, LRUAllocator, BufferSpec, CompilerSet, CompilerPair
from tinygrad.runtime.autogen import hip
from tinygrad.runtime.support.compiler_amd import HIPCompiler
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.renderer.cstyle import HIPHIPRenderer
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
def check(status):
@ -15,7 +14,7 @@ class HIPDevice(Compiled):
self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
compilers = CompilerSet([CompilerPair(functools.partial(HIPRenderer, self.arch), functools.partial(HIPCompiler, self.arch))])
compilers = CompilerSet([CompilerPair(functools.partial(HIPHIPRenderer, self.arch), None)])
super().__init__(device, HIPAllocator(self), compilers, functools.partial(HIPProgram, self))
def synchronize(self):
check(hip.hipSetDevice(self.device_id))

View file

@ -44,7 +44,7 @@ class MetalDevice(Compiled):
from tinygrad.runtime.graph.metal import MetalGraph
# NOTE: GitHub CI macOS runners use paravirtualized metal which is broken with graph.
# This can be reproduced locally with any virtualization software (like utm) that can create macOS VMs with apple's own virtualization framework.
super().__init__(device, MetalAllocator(self), CompilerSet([CompilerPair(MetalRenderer, MetalCompiler), CompilerPair(MetalRenderer, Compiler)]),
super().__init__(device, MetalAllocator(self), CompilerSet([CompilerPair(MetalRenderer, None)]),
functools.partial(MetalProgram, self), MetalGraph if 'virtual' not in from_ns_str(self.sysdevice.name()).lower() else None)
def synchronize(self):

View file

@ -8,9 +8,8 @@ from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU
from tinygrad.uop.ops import sint
from tinygrad.device import BufferSpec, CompilerPair, CompilerSet
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.renderer.cstyle import NVRenderer
from tinygrad.runtime.support.compiler_cuda import CUDACompiler, PTXCompiler, NVPTXCompiler, NVCompiler
from tinygrad.renderer.ptx import CUDAPTXRenderer, NVPTXRenderer
from tinygrad.renderer.cstyle import NVNVRenderer, CUDACUDARenderer
from tinygrad.runtime.support.compiler_mesa import NAKCompiler
from tinygrad.runtime.autogen import nv_570, nv_580, pci, mesa
from tinygrad.runtime.support.elf import elf_loader
@ -583,9 +582,9 @@ class NVDevice(HCQCompiled[HCQSignal]):
self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
self.sass_version = ((self.sm_version & 0xf00) >> 4) | (self.sm_version & 0xf)
cucc, ptxcc = (CUDACompiler, PTXCompiler) if MOCKGPU else (NVCompiler, NVPTXCompiler)
compilers = CompilerSet(ctrl_var=NV_CC, cset=[CompilerPair(functools.partial(NVRenderer, self.arch),functools.partial(cucc, self.arch)),
CompilerPair(functools.partial(PTXRenderer, self.arch, device="NV"), functools.partial(ptxcc, self.arch), NV_PTX),
nvr, ptxr = (CUDACUDARenderer, CUDAPTXRenderer) if MOCKGPU else (NVNVRenderer, NVPTXRenderer)
compilers = CompilerSet(ctrl_var=NV_CC, cset=[CompilerPair(functools.partial(nvr, self.arch), None),
CompilerPair(functools.partial(ptxr, self.arch), None, NV_PTX),
CompilerPair(functools.partial(NAKRenderer, dev=self), functools.partial(NAKCompiler, self.arch, self.max_warps_per_sm), NV_NAK)])
super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), HCQSignal, NVComputeQueue, NVCopyQueue)

View file

@ -213,10 +213,14 @@ class PythonProgram:
i += 1
return time.perf_counter() - st
class PythonCompiler(Compiler):
  """Compiler for the PYTHON backend: "compiling" just base64-decodes the rendered text into bytes."""
  def compile(self, src:str) -> bytes:
    decoded = base64.b64decode(src)
    return decoded
class PythonRenderer(Renderer):
device = "PYTHON"
code_for_op = python_alu
def __init__(self):
self.compiler = PythonCompiler()
match cast(str, EMULATE.value):
case "METAL": self.device, self.tensor_cores = "METAL", tc.metal
case "AMD": self.device, self.tensor_cores = "AMD", tc.amd_rdna3
@ -235,9 +239,6 @@ class PythonRenderer(Renderer):
lops = [(u.op, u.dtype, [uops.index(v) for v in u.src if u.op is not Ops.SPECIAL], u.arg) for u in uops]
return base64.b64encode(pickle.dumps(lops)).decode()
class PythonCompiler(Compiler):
def compile(self, src:str) -> bytes: return base64.b64decode(src)
class PythonAllocator(Allocator['PythonDevice']):
def _alloc(self, size, options): return memoryview(bytearray(size))
def _copyin(self, dest, src:memoryview): dest[:] = src
@ -245,4 +246,4 @@ class PythonAllocator(Allocator['PythonDevice']):
class PythonDevice(Compiled):
def __init__(self, device:str):
super().__init__(device, PythonAllocator(self), CompilerSet([CompilerPair(PythonRenderer, PythonCompiler)]), PythonProgram)
super().__init__(device, PythonAllocator(self), CompilerSet([CompilerPair(PythonRenderer, None)]), PythonProgram)

View file

@ -1,5 +1,5 @@
import functools, struct
from tinygrad.device import Compiled, Allocator, Compiler, BufferSpec, CompilerSet, CompilerPair
from tinygrad.device import Compiled, Allocator, BufferSpec, CompilerSet, CompilerPair
from tinygrad.renderer.wgsl import WGSLRenderer
from tinygrad.helpers import round_up, suppress_finalizing
from tinygrad.runtime.autogen import webgpu
@ -215,7 +215,7 @@ class WebGpuDevice(Compiled):
device_res = _run(webgpu.wgpuAdapterRequestDeviceF, webgpu.WGPURequestDeviceCallbackInfo, webgpu.WGPURequestDeviceCallback,
webgpu.WGPURequestDeviceStatus, 1, 2, adapter_res, dev_desc)
super().__init__(device, WebGpuAllocator(device_res), CompilerSet([CompilerPair(WGSLRenderer, Compiler)]),
super().__init__(device, WebGpuAllocator(device_res), CompilerSet([CompilerPair(WGSLRenderer, None)]),
functools.partial(WebGPUProgram, (device_res, webgpu.WGPUFeatureName_TimestampQuery in supported)))
def synchronize(self):