remove CompilerPair (#14638)

This commit is contained in:
Christopher Milan 2026-02-09 16:51:18 -08:00 committed by GitHub
commit e6562a5061
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
21 changed files with 112 additions and 147 deletions

View file

@ -1,5 +1,5 @@
import unittest
from tinygrad.device import CompileError, Device, Compiler
from tinygrad.device import CompileError, Device
if Device.DEFAULT=="METAL":
from tinygrad.runtime.ops_metal import MetalDevice, MetalCompiler, MetalProgram
@unittest.skipIf(Device.DEFAULT!="METAL", "Metal support required")
@ -48,28 +48,4 @@ kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgr
""")
with self.assertRaises(RuntimeError):
compiled = compiled[:40] # corrupt the compiled program
MetalProgram(device, "r_5", compiled)
def test_program_w_empty_compiler(self):
device = MetalDevice("metal")
compiler = Compiler(device)
compiled = compiler.compile("""
#include <metal_stdlib>
kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]){
data0[0] = 0;
}
""")
MetalProgram(device, "r_5", compiled)
def test_bad_program_w_empty_compiler(self):
device = MetalDevice("metal")
compiler = Compiler(device)
# this does not raise
compiled = compiler.compile("""
#include <metal_stdlib>
kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]){
invalid codes;
}
""")
with self.assertRaises(RuntimeError):
MetalProgram(device, "r_5", compiled)

View file

@ -1,9 +1,19 @@
import unittest
from unittest.mock import patch
from tinygrad import Device
from tinygrad.device import Buffer
from tinygrad.dtype import dtypes
from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram
@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
class TestCLCompileCache(unittest.TestCase):
def test_compile_cached(self):
device = Device[Device.DEFAULT]
src = "__kernel void cached_test(__global int* a) { a[0] = 1; }"
CLProgram(device, name="cached_test", lib=src.encode())
with patch.object(CLCompiler, 'compile', side_effect=RuntimeError("compile should not be called on cache hit")):
CLProgram(device, name="cached_test", lib=src.encode())
@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
class TestCLError(unittest.TestCase):
@unittest.skip("allocates tons of memory")
@ -17,7 +27,7 @@ class TestCLError(unittest.TestCase):
def test_invalid_kernel_name(self):
device = Device[Device.DEFAULT]
with self.assertRaises(RuntimeError) as err:
CLProgram(device, name="", lib=CLCompiler(device, "test").compile("__kernel void test(__global int* a) { a[0] = 1; }"))
CLProgram(device, name="", lib="__kernel void test(__global int* a) { a[0] = 1; }".encode())
assert str(err.exception) == "OpenCL Error -46: CL_INVALID_KERNEL_NAME"
def test_unaligned_copy(self):

View file

@ -139,7 +139,6 @@ def do_render(ctx:Renderer, prg:UOp, lin:UOp) -> UOp:
return prg.replace(src=prg.src + (UOp(Ops.SOURCE, arg=src),), arg=ctx.aux(list(lin.src)) if ctx.has_aux else prg.arg)
def do_compile(ctx:Renderer, prg:UOp, source:UOp) -> UOp|None:
if ctx.compiler is None: return None
lib = ctx.compiler.compile_cached(source.arg)
return prg.replace(src=prg.src + (UOp(Ops.BINARY, arg=lib),))

View file

@ -1,14 +1,14 @@
from __future__ import annotations
from dataclasses import dataclass, replace
from collections import defaultdict
from typing import Any, Generic, TypeVar, Iterator, Generator
from typing import Any, Generic, TypeVar, Iterator, Generator, TYPE_CHECKING
import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
from tinygrad.helpers import CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, dedup, ContextVar
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
from tinygrad.helpers import EMULATED_DTYPES
from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes, _to_np_dtype
from tinygrad.renderer import Renderer
if TYPE_CHECKING: from tinygrad.renderer import Renderer
# **************** Device ****************
@ -278,39 +278,34 @@ class Compiler:
def disassemble(self, lib:bytes): pass
@dataclass(frozen=True)
class CompilerPair:
renderer:type[Renderer]|functools.partial; compiler:type[Compiler]|functools.partial|None = None; ctrl_var:ContextVar|None = None # noqa: E702
name:str|None = None
@dataclass(frozen=True)
class CompilerSet: cset:list[CompilerPair]; ctrl_var:ContextVar|None = None # noqa: E702
class CompilerSet: cset:list[tuple[type[Renderer]|functools.partial, ContextVar|None]]; ctrl_var:ContextVar|None = None # noqa: E702
class Compiled:
profile_events:list[ProfileEvent] = [ProfileDeviceEvent("CPU")] # NOTE: CPU is the default device.
def __init__(self, device:str, allocator:Allocator, compilers:CompilerSet|None, runtime, graph=None, group_id=None):
from tinygrad.renderer import Renderer
self.device, self.allocator, self.runtime, self.graph, self.group_id = device, allocator, runtime, graph, group_id
self.comps_ctrl_var = compilers.ctrl_var if compilers is not None else None
self.comp_sets:dict[Any, tuple[ContextVar|None, tuple[type[Renderer]|functools.partial, type[Compiler]|functools.partial|None]]] = {}
self.cached_pair:dict[Any, tuple[Renderer, Compiler|None]] = {}
for cpair in (compilers.cset if compilers is not None else [CompilerPair(Renderer, Compiler)]):
self.comp_sets[cpair.name or self._compiler_name(cpair.renderer, cpair.compiler)] = (cpair.ctrl_var, (cpair.renderer, cpair.compiler))
self.comp_sets:dict[str, tuple[ContextVar|None, type[Renderer]|functools.partial]] = {}
self.cached_pair:dict[Any, Renderer] = {}
for ren, var in (compilers.cset if compilers is not None else [(Renderer, None)]):
self.comp_sets[var.key.split('_', 1)[-1] if var is not None else self._compiler_name(ren)] = (var, ren)
@property
def renderer(self) -> Renderer: return self._select_compiler_pair()[0]
def renderer(self) -> Renderer: return self._select_compiler_pair()
@property
def compiler(self) -> Compiler:
if (ret:=self.renderer.compiler or self._select_compiler_pair()[1]) is None: raise RuntimeError(f"no compiler for {self.device}")
if (ret:=self.renderer.compiler) is None: raise RuntimeError(f"no compiler for {self.device}")
return ret
def _compiler_name(self, r:type[Renderer]|functools.partial, c:type[Compiler]|functools.partial|None) -> str:
devname = self.device.split(':')[0].upper()
if c is None: return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname) or devname
return unwrap_class_type(c).__name__.upper().removesuffix("COMPILER").removeprefix(devname) or devname
def _compiler_name(self, r:type[Renderer]|functools.partial) -> str:
return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname:=self.device.split(':')[0].upper()) or devname
def _select_compiler_pair(self) -> tuple[Renderer, Compiler|None]:
def _select_compiler_pair(self) -> Renderer:
# select forced compiler from global env var.
forced_comps = set([self.comp_sets[val][1]] if self.comps_ctrl_var is not None and (val:=self.comps_ctrl_var.value) else [])
@ -399,18 +394,18 @@ def enumerate_devices_str() -> Generator[str, None, None]:
d = Device[device]
default_comp_pairs, default_compiler, cc_ctrl_var = d.comp_sets, d.compiler, d.comps_ctrl_var
try:
for k,(en,(r,c)) in default_comp_pairs.items():
d.comp_sets = {k:(None,(r,c))} # env var set to None, so it doesn't interfere
for k,(en,r) in default_comp_pairs.items():
d.comp_sets = {k:(None,r)} # env var set to None, so it doesn't interfere
d.comps_ctrl_var = None
try:
# d.renderer, d.compiler = r(), c()
with Context(CACHELEVEL=0): test = (Tensor([1,2,3], device=device) * 2).tolist()
if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
set_text = f'({cc_ctrl_var.key}={d._compiler_name(r, c)} to make default)' if cc_ctrl_var is not None else ''
set_text = f'({cc_ctrl_var.key}={d._compiler_name(r)} to make default)' if cc_ctrl_var is not None else ''
default_text = '(default)' if type(default_compiler) is type(d.compiler) else set_text
compilers_results.append(f"{colored('+', 'green')} {d._compiler_name(r, c)} {default_text}")
compilers_results.append(f"{colored('+', 'green')} {d._compiler_name(r)} {default_text}")
any_works = True
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._compiler_name(r, c)}: {e}")
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._compiler_name(r)}: {e}")
finally:
# put the defaults back!
d.comp_sets, d.comps_ctrl_var = default_comp_pairs, cc_ctrl_var

View file

@ -185,11 +185,12 @@ ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE = ContextVar("ALLOW_DEVICE_USAGE", 1), Conte
EMULATE, EMULATED_DTYPES = ContextVar("EMULATE", ""), ContextVar("EMULATED_DTYPES", "")
CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
# Compilers
CPU_LLVM, CPU_LVP, AMD_LLVM = ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0), ContextVar("AMD_LLVM", 0)
NV_PTX, CUDA_PTX, NV_NAK, QCOM_IR3 = ContextVar("NV_PTX", 0), ContextVar("CUDA_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("QCOM_IR3", 0)
CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
NV_CC, NV_PTX, NV_NAK = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0)
CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0)
AMD_CC, CPU_CC, NV_CC, CUDA_CC = ContextVar("AMD_CC", ""), ContextVar("CPU_CC", ""), ContextVar("NV_CC", ""), ContextVar("CUDA_CC", "")
QCOM_CC = ContextVar("QCOM_CC", "")
AMD_CC, AMD_LLVM, AMD_HIPCC = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
QCOM_CC, QCOM_IR3 = ContextVar("QCOM_CC", ""), ContextVar("QCOM_IR3", 0)
# VIZ implies PROFILE, but you can run PROFILE without VIZ
VIZ = ContextVar("VIZ", 0)
PROFILE = ContextVar("PROFILE", abs(VIZ.value))

View file

@ -1,5 +1,5 @@
from __future__ import annotations
from typing import Callable, cast, TYPE_CHECKING
from typing import Callable, cast
import functools
from dataclasses import dataclass, field
from tinygrad.helpers import to_function_name, dedup, prod, DEBUG
@ -7,7 +7,7 @@ from tinygrad.uop.ops import Ops, UOp, sym_infer, sint, Variable, ssimplify, Gro
from tinygrad.dtype import AddrSpace, PtrDType
from tinygrad.codegen.opt.tc import TensorCore
from tinygrad.codegen.opt import Opt
if TYPE_CHECKING: from tinygrad.device import Compiler
from tinygrad.device import Compiler
@dataclass(frozen=True)
class Estimates:
@ -150,7 +150,8 @@ class Renderer:
pre_matcher: PatternMatcher|None = None
extra_matcher: PatternMatcher|None = None
code_for_op: dict[Ops, Callable] = {}
compiler: Compiler|None = None
compiler: Compiler = Compiler()
def __reduce__(self): return self.__class__, ()
def render(self, uops:list[UOp]) -> str: raise NotImplementedError("needs a renderer")

View file

@ -3,10 +3,9 @@ import os, math, sys, struct
from collections import defaultdict, Counter
from tinygrad.codegen.opt import tc
from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str, axis_letters
from tinygrad.helpers import strip_parens, getenv, prod, dedup, select_first_inited, AMX, CPU_COUNT
from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX, CPU_COUNT
from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType, AddrSpace, truncate, float_to_bf16
from tinygrad.renderer import Renderer
from tinygrad.device import Compiler
from tinygrad.codegen.late.devectorizer import no_vectorized_alu
@ -343,8 +342,7 @@ class MetalRenderer(CStyleLanguage):
shared_max = 32768
def __init__(self):
from tinygrad.runtime.ops_metal import MetalCompiler
self.compiler = select_first_inited([MetalCompiler, Compiler], "No compiler for METAL is available")
self.tensor_cores = tc.metal if hasattr(os, 'uname') and os.uname().machine == "arm64" else []
self.compiler, self.tensor_cores = MetalCompiler(), tc.metal if hasattr(os, 'uname') and os.uname().machine == "arm64" else []
# language options
kernel_typedef = "kernel void"

View file

@ -115,7 +115,7 @@ def __getattr__(nm):
return load("rocprof", "['rocprof-trace-decoder', p:='/usr/local/lib/rocprof-trace-decoder.so', p.replace('so','dylib')]",
[f"{{}}/include/{s}.h" for s in ["rocprof_trace_decoder", "trace_decoder_instrument", "trace_decoder_types"]],
tarball="https://github.com/ROCm/rocprof-trace-decoder/archive/dd0485100971522cc4cd8ae136bdda431061a04d.tar.gz")
case "mesa": return load("mesa", "['tinymesa_cpu', 'tinymesa']", [
case "mesa": return load("mesa", "([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu']", [
*[f"{{}}/src/compiler/nir/{s}.h" for s in ["nir", "nir_builder", "nir_shader_compiler_options", "nir_serialize"]], "{}/gen/nir_intrinsics.h",
*[f"{{}}/src/nouveau/{s}.h" for s in ["headers/nv_device_info", "compiler/nak"]],
*[f"{{}}/src/gallium/auxiliary/gallivm/lp_bld{s}.h" for s in ["", "_passmgr", "_misc", "_type", "_init", "_nir", "_struct", "_jit_types",
@ -134,7 +134,8 @@ def __getattr__(nm):
*[f"python3 src/compiler/{s}_h.py > gen/{s.split('/')[-1]}.h" for s in ["nir/nir_opcodes", "nir/nir_builder_opcodes"]],
*[f"python3 src/compiler/nir/nir_{s}_h.py --outdir gen" for s in ["intrinsics", "intrinsics_indices"]]]), cwd=path, shell=True, check=True),
tarball="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.7/mesa-25.2.7.tar.gz",
prolog=["import gzip, base64"], epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
prolog=["from tinygrad.helpers import CPU_CC, CPU_LVP", "import gzip, base64"],
epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
case "libclang":
return load("libclang", clang_lib,
lambda: [f"{system('llvm-config-20 --includedir')}/clang-c/{s}.h" for s in ["Index", "CXString", "CXSourceLocation", "CXFile"]],

View file

@ -4,8 +4,9 @@ import ctypes
from typing import Annotated, Literal, TypeAlias
from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR
from tinygrad.runtime.support import c
from tinygrad.helpers import CPU_CC, CPU_LVP
import gzip, base64
dll = c.DLL('mesa', ['tinymesa_cpu', 'tinymesa'])
dll = c.DLL('mesa', ([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu'])
class struct_u_printf_info(ctypes.Structure): pass
u_printf_info: TypeAlias = struct_u_printf_info
uint32_t: TypeAlias = Annotated[int, ctypes.c_uint32]

View file

@ -6,9 +6,9 @@ from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator, hcq_filter_visible_devices
from tinygrad.uop.ops import sint
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet, CompilerPair
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet
from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, lo32, hi32, colored, prod, ContextVar
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv, unwrap
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, AMD_HIPCC, ceildiv, unwrap
from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer
from tinygrad.renderer.llvmir import AMDLLVMRenderer
from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt, amdgpu_kd, amdgpu_drm
@ -962,9 +962,9 @@ class AMDDevice(HCQCompiled):
self.sdma_queues:dict = {}
self.has_sdma_queue = self.sdma_queue(0) is not None
compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None),
CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), None, AMD_LLVM),
CompilerPair(functools.partial(AMDHIPCCRenderer, self.arch), None)], ctrl_var=AMD_CC)
compilers = CompilerSet([(functools.partial(AMDHIPRenderer, self.arch), None),
(functools.partial(AMDLLVMRenderer, self.arch), AMD_LLVM),
(functools.partial(AMDHIPCCRenderer, self.arch), AMD_HIPCC)], ctrl_var=AMD_CC)
super().__init__(device, AMDAllocator(self), compilers, functools.partial(AMDProgram, self), AMDSignal,
functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),

View file

@ -5,7 +5,7 @@ from tinygrad.runtime.autogen import opencl as cl
from tinygrad.runtime.support import c
from tinygrad.helpers import to_char_p_p, from_mv, OSX, DEBUG, mv_address, suppress_finalizing
from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError, CompilerPair, CompilerSet
from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError, CompilerSet
from tinygrad.dtype import ImageDType
CC_CB = c.CFUNCTYPE[None, [c.POINTER[ctypes.c_char], c.POINTER[None], cl.size_t, c.POINTER[None]]]
@ -39,9 +39,9 @@ class CLCompiler(Compiler):
class CLProgram:
def __init__(self, device:CLDevice, name:str, lib:bytes, buf_dtypes=[], **kwargs):
self.dev, self.name, self.lib, self.buf_dtypes = device, name, lib, buf_dtypes
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(lib)),
to_char_p_p([lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
self.dev, self.name, self.lib, self.buf_dtypes = device, name, device.clc.compile_cached(lib.decode()), buf_dtypes
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(self.lib)),
to_char_p_p([self.lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
errcode_ret := ctypes.c_int32()), errcode_ret)
check(binary_status.value)
check(cl.clBuildProgram(self.program, 1, device.device_id, None, BP_CB(), None)) # NOTE: OSX requires this
@ -125,8 +125,8 @@ class CLDevice(Compiled):
ctypes.string_at(buf, size=total.value).decode())[1]
renderer = IntelRenderer if "cl_intel_subgroup_matrix_multiply_accumulate" in self.device_exts else OpenCLRenderer
compiler = functools.partial(CLCompiler, self, f"{hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()}")
super().__init__(device, CLAllocator(self), CompilerSet([CompilerPair(renderer, compiler)]), functools.partial(CLProgram, self))
self.clc = CLCompiler(self, f"{hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()}")
super().__init__(device, CLAllocator(self), CompilerSet([(renderer, None)]), functools.partial(CLProgram, self))
def synchronize(self):
check(cl.clFinish(self.queue))
self.pending_copyin.clear()

View file

@ -2,7 +2,7 @@ from __future__ import annotations
import platform, sys, ctypes, functools, time, mmap, threading, queue
from tinygrad.helpers import to_mv, OSX, WIN, mv_address, wait_cond, suppress_finalizing, unwrap, data64_le
from tinygrad.helpers import CPU_CC, CPU_LVP, CPU_LLVM
from tinygrad.device import BufferSpec, DMACPURef, CompilerSet, CompilerPair
from tinygrad.device import BufferSpec, DMACPURef, CompilerSet
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
from tinygrad.runtime.support.hcq import CLikeArgsState
from tinygrad.renderer.cstyle import ClangJITRenderer
@ -133,6 +133,5 @@ class CPUDevice(HCQCompiled):
def __init__(self, device:str=""):
self.tasks:queue.Queue = queue.Queue()
CPUWorker(self, self.tasks, thread_id=0).start()
compilers = CompilerSet([CompilerPair(ClangJITRenderer, None), CompilerPair(CPULLVMRenderer, None, ctrl_var=CPU_LLVM),
CompilerPair(LVPRenderer, None, ctrl_var=CPU_LVP)], ctrl_var=CPU_CC)
compilers = CompilerSet([(ClangJITRenderer, None), (CPULLVMRenderer, CPU_LLVM), (LVPRenderer, CPU_LVP)], ctrl_var=CPU_CC)
super().__init__(device, CPUAllocator(self), compilers, functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue)

View file

@ -1,7 +1,7 @@
from __future__ import annotations
import ctypes, functools
from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing, CUDA_CC, CUDA_PTX
from tinygrad.device import Compiled, BufferSpec, LRUAllocator, CompilerPair, CompilerSet
from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing, CUDA_CC, CUDA_PTX, CUDA_NVCC
from tinygrad.device import Compiled, BufferSpec, LRUAllocator, CompilerSet
from tinygrad.renderer.cstyle import CUDARenderer
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.runtime.autogen import cuda
@ -118,9 +118,9 @@ class CUDADevice(Compiled):
CUDADevice.devices.append(self)
from tinygrad.runtime.graph.cuda import CUDAGraph
compilers = CompilerSet([CompilerPair(functools.partial(CUDARenderer, self.arch, device="CUDA")),
CompilerPair(functools.partial(PTXRenderer, self.arch, device="CUDA"), ctrl_var=CUDA_PTX),
CompilerPair(functools.partial(CUDARenderer, self.arch, device="CUDA", use_nvcc=True), name="NVCC")], ctrl_var=CUDA_CC)
compilers = CompilerSet([(functools.partial(CUDARenderer, self.arch, device="CUDA"), None),
(functools.partial(PTXRenderer, self.arch, device="CUDA"), CUDA_PTX),
(functools.partial(CUDARenderer, self.arch, device="CUDA", use_nvcc=True), CUDA_NVCC)], ctrl_var=CUDA_CC)
super().__init__(device, CUDAAllocator(self), compilers, functools.partial(CUDAProgram, self), None if MOCKGPU else CUDAGraph)
def synchronize(self):

View file

@ -1,7 +1,7 @@
from __future__ import annotations
import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
assert sys.platform != 'win32'
from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, CompilerSet, CompilerPair
from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, CompilerSet
from tinygrad.dtype import dtypes, DType, PtrDType
from tinygrad.uop.ops import Ops, UOp
from tinygrad.helpers import getenv, round_up, mv_address, to_mv, cpu_objdump, system, DEBUG, suppress_finalizing
@ -146,10 +146,10 @@ class DSPCompiler(Compiler):
class DSPDevice(Compiled):
def __init__(self, device:str=""):
if getenv("MOCKDSP"): super().__init__(device, DSPAllocator(self), CompilerSet([CompilerPair(MockDSPRenderer)]), MockDSPProgram)
if getenv("MOCKDSP"): super().__init__(device, DSPAllocator(self), CompilerSet([(MockDSPRenderer, None)]), MockDSPProgram)
else:
self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
super().__init__(device, DSPAllocator(self), CompilerSet([CompilerPair(DSPRenderer)]), functools.partial(DSPProgram, self))
super().__init__(device, DSPAllocator(self), CompilerSet([(DSPRenderer, None)]), functools.partial(DSPProgram, self))
fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
self.shell_buf = self.allocator.alloc(round_up(fastrpc_shell.nbytes, 0x1000), BufferSpec(nolru=True))
ctypes.memmove(self.shell_buf.va_addr, mv_address(fastrpc_shell), fastrpc_shell.nbytes)

View file

@ -1,6 +1,6 @@
import ctypes, functools
from tinygrad.helpers import mv_address, getenv, suppress_finalizing
from tinygrad.device import Compiled, LRUAllocator, BufferSpec, CompilerSet, CompilerPair
from tinygrad.device import Compiled, LRUAllocator, BufferSpec, CompilerSet
from tinygrad.runtime.autogen import hip
from tinygrad.renderer.cstyle import HIPRenderer
from tinygrad.runtime.support.c import init_c_var, init_c_struct_t
@ -15,7 +15,7 @@ class HIPDevice(Compiled):
self.arch = init_c_var(hip.hipDeviceProp_t, lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t, lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
compilers = CompilerSet([CompilerPair(functools.partial(HIPRenderer, self.arch), None)])
compilers = CompilerSet([(functools.partial(HIPRenderer, self.arch), None)])
super().__init__(device, HIPAllocator(self), compilers, functools.partial(HIPProgram, self))
def synchronize(self):
check(hip.hipSetDevice(self.device_id))

View file

@ -1,7 +1,7 @@
import subprocess, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform
from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap, suppress_finalizing
import subprocess, pathlib, struct, ctypes, tempfile, functools, decimal, platform
from tinygrad.helpers import prod, to_mv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap, suppress_finalizing
import tinygrad.runtime.support.objc as objc
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent, CompilerSet, CompilerPair
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent, CompilerSet
from tinygrad.renderer.cstyle import MetalRenderer
from tinygrad.runtime.autogen import metal
from tinygrad.runtime.support.c import DLL
@ -42,7 +42,7 @@ class MetalDevice(Compiled):
from tinygrad.runtime.graph.metal import MetalGraph
# NOTE: GitHub CI macOS runners use paravirtualized metal which is broken with graph.
# This can be reproduced locally with any virtualization software (like utm) that can create macOS VMs with apple's own virtualization framework.
super().__init__(device, MetalAllocator(self), CompilerSet([CompilerPair(MetalRenderer, None)]),
super().__init__(device, MetalAllocator(self), CompilerSet([(MetalRenderer, None)]),
functools.partial(MetalProgram, self), MetalGraph if 'virtual' not in from_ns_str(self.sysdevice.name()).lower() else None)
def synchronize(self):
@ -54,20 +54,12 @@ class MetalDevice(Compiled):
Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en)]
self.mtl_buffers_in_flight.clear()
def metal_src_to_library(device:MetalDevice, src:str) -> metal.MTLLibrary:
options = metal.MTLCompileOptions.new()
options.setFastMathEnabled(getenv("METAL_FAST_MATH"))
library = device.sysdevice.newLibraryWithSource_options_error(to_ns_str(src), options, ctypes.byref(compileError:=metal.NSError().retained()))
error_check(compileError, CompileError)
return library
class MetalCompiler(Compiler):
# Opening METAL after LLVM doesn't fail because ctypes.CDLL opens with RTLD_LOCAL but MTLCompiler opens it's own llvm with RTLD_GLOBAL
# This means that MTLCompiler's llvm will create it's own instances of global state because RTLD_LOCAL doesn't export symbols, but if RTLD_GLOBAL
# library is loaded first then RTLD_LOCAL library will just use it's symbols. On linux there is RTLD_DEEPBIND to prevent that, but on macos there
# doesn't seem to be anything we can do.
with contextlib.suppress(FileNotFoundError, ModuleNotFoundError):
import tinygrad.runtime.autogen.llvm # noqa: F401
import tinygrad.runtime.autogen.llvm as _
support = DLL("MTLCompiler", "MTLCompiler")
support.MTLCodeGenServiceCreate.restype = ctypes.c_void_p
@ -118,15 +110,9 @@ class MetalCompiler(Compiler):
class MetalProgram:
def __init__(self, dev:MetalDevice, name:str, lib:bytes, **kwargs):
self.dev, self.name, self.lib = dev, name, lib
if lib[:4] == b"MTLB":
# binary metal library
data = objc.dispatch_data_create(lib, len(lib), None, None)
self.library = self.dev.sysdevice.newLibraryWithData_error(data, ctypes.byref(error_lib:=metal.NSError().retained())).retained()
error_check(error_lib)
else:
# metal source. rely on OS caching
try: self.library = metal_src_to_library(self.dev, lib.decode())
except CompileError as e: raise RuntimeError from e
data = objc.dispatch_data_create(lib, len(lib), None, None)
self.library = self.dev.sysdevice.newLibraryWithData_error(data, ctypes.byref(error_lib:=metal.NSError().retained())).retained()
error_check(error_lib)
self.fxn = self.library.newFunctionWithName(to_ns_str(name)).retained()
descriptor = metal.MTLComputePipelineDescriptor.new()
descriptor.setComputeFunction(self.fxn)

View file

@ -1,5 +1,5 @@
import functools
from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
from tinygrad.device import Compiled, Allocator, CompilerSet
from tinygrad.engine.jit import MultiGraphRunner
from tinygrad.renderer.cstyle import Renderer, CStyleLanguage, AMDHIPRenderer
from tinygrad.uop.ops import Ops
@ -39,6 +39,6 @@ class NullDevice(Compiled):
case "AMD_CDNA4": renderer = functools.partial(AMDHIPRenderer, "gfx950")
case "": renderer = NullRenderer
case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
compilers = CompilerSet([CompilerPair(renderer, Compiler), CompilerPair(functools.partial(IR3Renderer, 0x6030001), None, NULL_IR3), # adreno 630
CompilerPair(functools.partial(NAKRenderer, "sm_120", 48), None, NULL_NAK)]) # 5090
compilers = CompilerSet([(renderer, None), (functools.partial(IR3Renderer, 0x6030001), NULL_IR3), # adreno 630
(functools.partial(NAKRenderer, "sm_120", 48), NULL_NAK)]) # 5090
super().__init__(device, NullAllocator(self), compilers, functools.partial(NullProgram, device), NullGraph)

View file

@ -6,7 +6,7 @@ from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
from tinygrad.uop.ops import sint
from tinygrad.device import Compiled, BufferSpec, CompilerPair, CompilerSet
from tinygrad.device import Compiled, BufferSpec, CompilerSet
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE
from tinygrad.helpers import ContextVar, VIZ, ProfileEvent
from tinygrad.renderer.ptx import PTXRenderer
@ -619,9 +619,9 @@ class NVDevice(HCQCompiled[NVSignal]):
self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
self.sass_version = ((self.sm_version & 0xf00) >> 4) | (self.sm_version & 0xf)
compilers = CompilerSet(ctrl_var=NV_CC, cset=[CompilerPair(functools.partial(CUDARenderer, self.arch)),
CompilerPair(functools.partial(PTXRenderer, self.arch, device="NV"), ctrl_var=NV_PTX),
CompilerPair(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), ctrl_var=NV_NAK)])
compilers = CompilerSet(ctrl_var=NV_CC, cset=[(functools.partial(CUDARenderer, self.arch), None),
(functools.partial(PTXRenderer, self.arch, device="NV"), NV_PTX),
(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK)])
super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
self.pma_enabled = PMA.value > 0 and PROFILE >= 1

View file

@ -6,7 +6,7 @@ from typing import Any, TYPE_CHECKING
import pickle, base64, itertools, time, struct, sys, functools
from tinygrad.dtype import DType, dtypes, ImageDType, PtrDType, truncate, float_to_fp16, float_to_bf16, float_to_fp8, fp8_to_float
from tinygrad.helpers import all_same, getenv, flatten, get_single_element, EMULATE
from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet
from tinygrad.codegen.opt import tc
from tinygrad.uop.ops import exec_alu, python_alu, Ops, UOp, GroupOp
from tinygrad.renderer import Renderer
@ -248,4 +248,4 @@ class PythonAllocator(Allocator['PythonDevice']):
class PythonDevice(Compiled):
def __init__(self, device:str):
super().__init__(device, PythonAllocator(self), CompilerSet([CompilerPair(PythonRenderer, None)]), PythonProgram)
super().__init__(device, PythonAllocator(self), CompilerSet([(PythonRenderer, None)]), PythonProgram)

View file

@ -2,15 +2,15 @@ from __future__ import annotations
import os, ctypes, functools, mmap, struct, array, math, sys, weakref, contextlib
assert sys.platform != 'win32'
from typing import Any
from tinygrad.device import BufferSpec, CompilerSet, CompilerPair, Device
from tinygrad.device import BufferSpec, CompilerSet, Device
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
from tinygrad.runtime.autogen import kgsl, mesa
from tinygrad.runtime.ops_cl import CLCompiler, CLDevice
from tinygrad.runtime.ops_cl import CLDevice
from tinygrad.renderer.cstyle import QCOMRenderer
from tinygrad.renderer.nir import IR3Renderer
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, ceildiv, prod, fromimport, cpu_profile, lo32, suppress_finalizing
from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE
from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE, DEBUG
from tinygrad.dtype import ImageDType, dtypes
from tinygrad.runtime.support.system import System
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
@ -49,10 +49,6 @@ def pkt7_hdr(opcode: int, cnt: int): return mesa.CP_TYPE7_PKT | cnt & 0x3FFF | p
def pkt4_hdr(reg: int, cnt: int): return mesa.CP_TYPE4_PKT | cnt & 0x7F | parity(cnt) << 7 | (reg & 0x3FFFF) << 8 | parity(reg) << 27
def _read_lib(lib, off) -> int: return struct.unpack("I", lib[off:off+4])[0]
class QCOMCompiler(CLCompiler):
def __init__(self, device:str=""): super().__init__(CLDevice(device), 'compile_qcom')
def disassemble(self, lib:bytes):
fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)])
class QCOMSignal(HCQSignal):
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 19.2})
@ -231,7 +227,7 @@ class QCOMArgsState(HCQArgsState):
class QCOMProgram(HCQProgram):
def __init__(self, dev: QCOMDevice, name: str, lib: bytes, buf_dtypes=[], **kwargs):
self.dev: QCOMDevice = dev
self.buf_dtypes, self.name, self.lib, self.NIR = buf_dtypes, name, lib, isinstance(dev.renderer, IR3Renderer)
self.buf_dtypes, self.name, self.NIR = buf_dtypes, name, isinstance(dev.renderer, IR3Renderer)
if self.NIR:
from tinygrad.runtime.support.compiler_mesa import IR3Compiler
@ -252,7 +248,9 @@ class QCOMProgram(HCQProgram):
self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
self.consts_info:list[tuple] = []
else: self._parse_lib()
else:
self._parse_lib(lib:=self.dev.cl_dev.clc.compile_cached(lib.decode()))
if DEBUG >= 7: fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)])
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
to_mv(self.lib_gpu.va_addr, self.image_size)[:] = self.image
@ -274,21 +272,21 @@ class QCOMProgram(HCQProgram):
raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)
def _parse_lib(self):
def _parse_lib(self, lib):
# Extract image binary
self.image_size = _read_lib(self.lib, 0x100)
self.image = bytearray(self.lib[(image_offset:=_read_lib(self.lib, 0xc0)):image_offset+self.image_size])
self.image_size = _read_lib(lib, 0x100)
self.image = bytearray(lib[(image_offset:=_read_lib(lib, 0xc0)):image_offset+self.image_size])
# Parse image descriptors
image_desc_off = _read_lib(self.lib, 0x110)
self.prg_offset, self.brnchstck = _read_lib(self.lib, image_desc_off+0xc4), _read_lib(self.lib, image_desc_off+0x108) // 2
self.pvtmem, self.shmem = _read_lib(self.lib, image_desc_off+0xc8), _read_lib(self.lib, image_desc_off+0xd8)
image_desc_off = _read_lib(lib, 0x110)
self.prg_offset, self.brnchstck = _read_lib(lib, image_desc_off+0xc4), _read_lib(lib, image_desc_off+0x108) // 2
self.pvtmem, self.shmem = _read_lib(lib, image_desc_off+0xc8), _read_lib(lib, image_desc_off+0xd8)
# Fill up constants and buffers info
self.consts_info = []
# Collect sampler info.
self.samp_cnt = samp_cnt_in_file = _read_lib(self.lib, image_desc_off + 0xdc)
self.samp_cnt = samp_cnt_in_file = _read_lib(lib, image_desc_off + 0xdc)
assert self.samp_cnt <= 1, "Up to one sampler supported"
if self.samp_cnt:
self.samp_cnt += 1
@ -298,8 +296,8 @@ class QCOMProgram(HCQProgram):
# Collect kernel arguments (buffers) info.
bdoff, binfos = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samp_cnt_in_file, []
while bdoff + 32 <= len(self.lib):
length, _, _, offset_words, _, _, _, typ = struct.unpack("8I", self.lib[bdoff:bdoff+32])
while bdoff + 32 <= len(lib):
length, _, _, offset_words, _, _, _, typ = struct.unpack("8I", lib[bdoff:bdoff+32])
if length == 0: break
binfos.append((offset_words * 4, typ))
bdoff += length
@ -309,16 +307,16 @@ class QCOMProgram(HCQProgram):
self.tex_cnt, self.ibo_cnt = sum(typ is BUFTYPE_TEX for _,typ in binfos), sum(typ is BUFTYPE_IBO for _,typ in binfos)
self.ibo_off, self.tex_off, self.samp_off = 2048, 2048 + 0x40 * self.ibo_cnt, 2048 + 0x40 * self.tex_cnt + 0x40 * self.ibo_cnt
if _read_lib(self.lib, 0xb0) != 0: # check if we have constants.
cdoff = _read_lib(self.lib, 0xac)
if _read_lib(lib, 0xb0) != 0: # check if we have constants.
cdoff = _read_lib(lib, 0xac)
while cdoff + 40 <= image_offset:
cnst, offset_words, _, is32 = struct.unpack("I", self.lib[cdoff:cdoff+4])[0], *struct.unpack("III", self.lib[cdoff+16:cdoff+28])
cnst, offset_words, _, is32 = struct.unpack("I", lib[cdoff:cdoff+4])[0], *struct.unpack("III", lib[cdoff+16:cdoff+28])
self.consts_info.append((cnst, offset_words * (sz_bytes:=(2 << is32)), sz_bytes))
cdoff += 40
# Registers info
reg_desc_off = _read_lib(self.lib, 0x34)
self.fregs, self.hregs = _read_lib(self.lib, reg_desc_off + 0x14), _read_lib(self.lib, reg_desc_off + 0x18)
reg_desc_off = _read_lib(lib, 0x34)
self.fregs, self.hregs = _read_lib(lib, reg_desc_off + 0x14), _read_lib(lib, reg_desc_off + 0x18)
class QCOMTextureInfo:
def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
@ -385,8 +383,8 @@ class QCOMDevice(HCQCompiled):
if PROFILE and self.gpu_id[:2] < (7, 3):
System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")
compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[CompilerPair(QCOMRenderer, functools.partial(QCOMCompiler, device)),
CompilerPair(functools.partial(IR3Renderer, info.chip_id), None, QCOM_IR3)])
self.cl_dev = CLDevice(device)
compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[(QCOMRenderer, None), (functools.partial(IR3Renderer, info.chip_id), QCOM_IR3)])
super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
functools.partial(QCOMComputeQueue, self), None)

View file

@ -1,5 +1,5 @@
import functools, struct
from tinygrad.device import Compiled, Allocator, Compiler, BufferSpec, CompilerSet, CompilerPair
from tinygrad.device import Compiled, Allocator, BufferSpec, CompilerSet
from tinygrad.renderer.wgsl import WGSLRenderer
from tinygrad.helpers import round_up, suppress_finalizing
from tinygrad.runtime.autogen import webgpu
@ -217,7 +217,7 @@ class WebGpuDevice(Compiled):
self.device_res = _run(webgpu.wgpuAdapterRequestDeviceF, webgpu.WGPURequestDeviceCallbackInfo, webgpu.WGPURequestDeviceCallback,
webgpu.WGPURequestDeviceStatus, 1, 2, adapter_res, dev_desc)
super().__init__(device, WebGpuAllocator(self), CompilerSet([CompilerPair(WGSLRenderer, Compiler)]),
super().__init__(device, WebGpuAllocator(self), CompilerSet([(WGSLRenderer, None)]),
functools.partial(WebGPUProgram, (self.device_res, webgpu.WGPUFeatureName_TimestampQuery in supported)))
def synchronize(self):