mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
replace CompilerSet with list (#15530)
* replace CompilerSet with list * oops * default Renderer list
This commit is contained in:
parent
bc866a93f0
commit
6fb038d109
17 changed files with 78 additions and 90 deletions
|
|
@ -265,50 +265,39 @@ class Compiler:
|
|||
return lib
|
||||
def disassemble(self, lib:bytes): pass
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CompilerSet: cset:list[tuple[type[Renderer]|functools.partial, ContextVar|None]]; ctrl_var:ContextVar|None = None # noqa: E702
|
||||
|
||||
class Compiled:
|
||||
profile_events:list[ProfileEvent] = [ProfileDeviceEvent("CPU")] # NOTE: CPU is the default device.
|
||||
|
||||
def __init__(self, device:str, allocator:Allocator, compilers:CompilerSet|None, runtime, graph=None):
|
||||
def __init__(self, device:str, allocator:Allocator, renderers:list[type[Renderer]|functools.partial], runtime, graph=None):
|
||||
from tinygrad.renderer import Renderer
|
||||
|
||||
self.device, self.allocator, self.runtime, self.graph = device, allocator, runtime, graph
|
||||
|
||||
self.comps_ctrl_var = compilers.ctrl_var if compilers is not None else None
|
||||
self.comp_sets:dict[str, tuple[ContextVar|None, type[Renderer]|functools.partial]] = {}
|
||||
self.cached_pair:dict[Any, Renderer] = {}
|
||||
for ren, var in (compilers.cset if compilers is not None else [(Renderer, None)]):
|
||||
self.comp_sets[var.key.split('_', 1)[-1] if var is not None else self._compiler_name(ren)] = (var, ren)
|
||||
self.device, self.allocator, self.runtime, self.graph, self.renderers = device, allocator, runtime, graph, renderers or [Renderer]
|
||||
self.cached_renderer:dict[Any, Renderer] = {}
|
||||
|
||||
@property
|
||||
def renderer(self) -> Renderer: return self._select_compiler_pair()
|
||||
def renderer(self) -> Renderer: return self._select_renderer()
|
||||
|
||||
@property
|
||||
def compiler(self) -> Compiler:
|
||||
if (ret:=self.renderer.compiler) is None: raise RuntimeError(f"no compiler for {self.device}")
|
||||
return ret
|
||||
|
||||
def _compiler_name(self, r:type[Renderer]|functools.partial) -> str:
|
||||
def _renderer_name(self, r:type[Renderer]|functools.partial) -> str:
|
||||
return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname:=self.device.split(':')[0].upper()) or devname
|
||||
|
||||
def _select_compiler_pair(self) -> Renderer:
|
||||
def _renderer_var(self, r:type[Renderer]|functools.partial) -> ContextVar|None:
|
||||
return ContextVar._cache.get(f"{self.device}_{self._renderer_name(r)}", None)
|
||||
|
||||
def _select_renderer(self) -> Renderer:
|
||||
# select forced compiler from global env var.
|
||||
forced_comps = set([self.comp_sets[val][1]] if self.comps_ctrl_var is not None and (val:=self.comps_ctrl_var.value) else [])
|
||||
forced_comps = set([r for r in self.renderers if self._renderer_name(r) == val] if
|
||||
(ctrl:=ContextVar._cache.get(f"{self.device}_CC", None)) is not None and (val:=ctrl.value) else [])
|
||||
|
||||
# add forced compilers from individual env vars (only if global env var is not set, as it takes precedence).
|
||||
if not forced_comps: forced_comps |= set(rc for en, rc in self.comp_sets.values() if en is not None and en.value == 1)
|
||||
if not forced_comps: forced_comps |= set(r for r in self.renderers if (en:=self._renderer_var(r)) is not None and en.value == 1)
|
||||
if len(forced_comps) > 1: raise RuntimeError(f"{self.device}: multiple compilers set in env {forced_comps}")
|
||||
|
||||
# select remaining compilers (all or forced only)
|
||||
comps = list(rc for en, rc in self.comp_sets.values())
|
||||
|
||||
# remove disabled compilers
|
||||
for en, rc in self.comp_sets.values():
|
||||
if en is not None and en.value == 0 and en.key in os.environ and rc in comps: comps.remove(rc)
|
||||
|
||||
return select_first_inited(list(forced_comps) if len(forced_comps)>0 else comps, f"No compiler for {self.device} is available", self.cached_pair)
|
||||
return select_first_inited(list(forced_comps) if len(forced_comps)>0 else self.renderers, f"No renderer for {self.device} is available",
|
||||
self.cached_renderer)
|
||||
|
||||
def synchronize(self):
|
||||
"""
|
||||
|
|
@ -384,23 +373,23 @@ def enumerate_devices_str() -> Generator[str, None, None]:
|
|||
compilers_results, any_works = [], False
|
||||
try:
|
||||
d = Device[device]
|
||||
default_comp_pairs, default_compiler, cc_ctrl_var = d.comp_sets, d.compiler, d.comps_ctrl_var
|
||||
default_renderers, default_renderer = d.renderers, d.renderer
|
||||
try:
|
||||
for k,(en,r) in default_comp_pairs.items():
|
||||
d.comp_sets = {k:(None,r)} # env var set to None, so it doesn't interfere
|
||||
d.comps_ctrl_var = None
|
||||
for r in default_renderers:
|
||||
d.renderers = [r]
|
||||
try:
|
||||
# d.renderer, d.compiler = r(), c()
|
||||
with Context(CACHELEVEL=0): test = (Tensor([1,2,3], device=device) * 2).tolist()
|
||||
with Context(CACHELEVEL=0, **({f"{device}_CC": d._renderer_name(r)} if (ctrl:=f"{device}_CC") in ContextVar._cache else {})):
|
||||
test = (Tensor([1,2,3], device=device) * 2).tolist()
|
||||
if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
|
||||
set_text = f'({cc_ctrl_var.key}={d._compiler_name(r)} to make default)' if cc_ctrl_var is not None else ''
|
||||
default_text = '(default)' if type(default_compiler) is type(d.compiler) else set_text
|
||||
compilers_results.append(f"{colored('+', 'green')} {d._compiler_name(r)} {default_text}")
|
||||
set_text = f'({ctrl}={d._renderer_name(r)} to make default)' if (ctrl:=f"{device}_CC") in ContextVar._cache else ''
|
||||
default_text = '(default)' if type(default_renderer) is type(d.renderer) else set_text
|
||||
compilers_results.append(f"{colored('+', 'green')} {d._renderer_name(r)} {default_text}")
|
||||
any_works = True
|
||||
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._compiler_name(r)}: {e}")
|
||||
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._renderer_name(r)}: {e}")
|
||||
finally:
|
||||
# put the defaults back!
|
||||
d.comp_sets, d.comps_ctrl_var = default_comp_pairs, cc_ctrl_var
|
||||
d.renderers = default_renderers
|
||||
result = (colored('PASS', 'green') if any_works else f"{colored('FAIL', 'yellow')}") + ''.join([f'\n{" "*16} {x}' for x in compilers_results])
|
||||
except Exception as e:
|
||||
result = f"{colored('FAIL', 'red')} {e}"
|
||||
|
|
|
|||
|
|
@ -6,9 +6,10 @@ from dataclasses import dataclass
|
|||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator, hcq_filter_visible_devices
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import Compiled, BufferSpec, CompilerSet
|
||||
from tinygrad.device import Compiled, BufferSpec
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, lo32, hi32, colored, prod, ContextVar
|
||||
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, AMD_HIPCC, ceildiv, unwrap
|
||||
from tinygrad.helpers import VIZ, ceildiv, unwrap
|
||||
from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer
|
||||
from tinygrad.renderer.llvmir import AMDLLVMRenderer
|
||||
from tinygrad.runtime.autogen import kfd, hsa, sqtt, amdgpu_kd, amdgpu_drm
|
||||
|
|
@ -967,11 +968,10 @@ class AMDDevice(HCQCompiled):
|
|||
self.sdma_queues:dict = {}
|
||||
self.has_sdma_queue = self.sdma_queue(0) is not None
|
||||
|
||||
compilers = CompilerSet([(functools.partial(AMDHIPRenderer, self.arch), None),
|
||||
(functools.partial(AMDLLVMRenderer, self.arch), AMD_LLVM),
|
||||
(functools.partial(AMDHIPCCRenderer, self.arch), AMD_HIPCC)], ctrl_var=AMD_CC)
|
||||
renderers:list[type[Renderer]|functools.partial] = [functools.partial(AMDHIPRenderer, self.arch), functools.partial(AMDLLVMRenderer, self.arch),
|
||||
functools.partial(AMDHIPCCRenderer, self.arch)]
|
||||
|
||||
super().__init__(device, AMDAllocator(self), compilers, functools.partial(AMDProgram, self), AMDSignal,
|
||||
super().__init__(device, AMDAllocator(self), renderers, functools.partial(AMDProgram, self), AMDSignal,
|
||||
functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),
|
||||
functools.partial(AMDCopyQueue, self, max_copy_size=self.max_copy_size) if self.has_sdma_queue else None,
|
||||
kernargs_size=(8 << 10) if self.is_usb() else (16 << 20), sigalloc_size=0x100 if self.is_usb() else 0x1000,
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from tinygrad.runtime.autogen import opencl as cl
|
|||
from tinygrad.runtime.support import c
|
||||
from tinygrad.helpers import to_char_p_p, from_mv, OSX, DEBUG, mv_address, suppress_finalizing
|
||||
from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
|
||||
from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError, CompilerSet
|
||||
from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError
|
||||
from tinygrad.dtype import ImageDType
|
||||
|
||||
CC_CB = c.CFUNCTYPE[None, [c.POINTER[ctypes.c_char], c.POINTER[None], cl.size_t, c.POINTER[None]]]
|
||||
|
|
@ -119,7 +119,7 @@ class CLDevice(Compiled):
|
|||
|
||||
renderer = IntelRenderer if "cl_intel_subgroup_matrix_multiply_accumulate" in self.device_exts else OpenCLRenderer
|
||||
self.cl_compiler = CLCompiler(self, f"{hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()}")
|
||||
super().__init__(device, CLAllocator(self), CompilerSet([(renderer, None)]), functools.partial(CLProgram, self))
|
||||
super().__init__(device, CLAllocator(self), [renderer], functools.partial(CLProgram, self))
|
||||
|
||||
def synchronize(self):
|
||||
check(cl.clFinish(self.queue))
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
from __future__ import annotations
|
||||
import platform, sys, ctypes, functools, time, mmap, threading, queue
|
||||
from tinygrad.helpers import to_mv, OSX, WIN, mv_address, suppress_finalizing, unwrap, data64_le
|
||||
from tinygrad.helpers import CPU_CC, CPU_LVP, CPU_LLVM
|
||||
from tinygrad.device import BufferSpec, CompilerSet
|
||||
from tinygrad.device import BufferSpec
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
|
||||
from tinygrad.runtime.support.hcq import CLikeArgsState
|
||||
from tinygrad.renderer.cstyle import ClangJITRenderer
|
||||
|
|
@ -136,5 +136,5 @@ class CPUDevice(HCQCompiled):
|
|||
def __init__(self, device:str=""):
|
||||
self.tasks:queue.Queue = queue.Queue()
|
||||
CPUWorker(self, self.tasks, thread_id=0).start()
|
||||
compilers = CompilerSet([(ClangJITRenderer, None), (CPULLVMRenderer, CPU_LLVM), (LVPRenderer, CPU_LVP)], ctrl_var=CPU_CC)
|
||||
super().__init__(device, CPUAllocator(self), compilers, functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue)
|
||||
renderers:list[type[Renderer]|functools.partial] = [ClangJITRenderer, CPULLVMRenderer, LVPRenderer]
|
||||
super().__init__(device, CPUAllocator(self), renderers, functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,8 @@
|
|||
from __future__ import annotations
|
||||
import ctypes, functools
|
||||
from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing, CUDA_CC, CUDA_PTX, CUDA_NVCC
|
||||
from tinygrad.device import Compiled, BufferSpec, LRUAllocator, CompilerSet
|
||||
from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing
|
||||
from tinygrad.device import Compiled, BufferSpec, LRUAllocator
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
from tinygrad.runtime.autogen import cuda
|
||||
|
|
@ -118,10 +119,10 @@ class CUDADevice(Compiled):
|
|||
CUDADevice.devices.append(self)
|
||||
|
||||
from tinygrad.runtime.graph.cuda import CUDAGraph
|
||||
compilers = CompilerSet([(functools.partial(CUDARenderer, self.arch, device="CUDA"), None),
|
||||
(functools.partial(PTXRenderer, self.arch, device="CUDA"), CUDA_PTX),
|
||||
(functools.partial(CUDARenderer, self.arch, device="CUDA", use_nvcc=True), CUDA_NVCC)], ctrl_var=CUDA_CC)
|
||||
super().__init__(device, CUDAAllocator(self), compilers, functools.partial(CUDAProgram, self), None if MOCKGPU else CUDAGraph)
|
||||
renderers:list[type[Renderer]|functools.partial] = [functools.partial(CUDARenderer, self.arch, device="CUDA"),
|
||||
functools.partial(PTXRenderer, self.arch, device="CUDA"),
|
||||
functools.partial(CUDARenderer, self.arch, device="CUDA", use_nvcc=True)]
|
||||
super().__init__(device, CUDAAllocator(self), renderers, functools.partial(CUDAProgram, self), None if MOCKGPU else CUDAGraph)
|
||||
|
||||
def synchronize(self):
|
||||
check(cuda.cuCtxSetCurrent(self.context))
|
||||
|
|
|
|||
|
|
@ -15,7 +15,7 @@ class DiskDevice(Compiled):
|
|||
self.size: int|None = None
|
||||
self.fd: int|None = None
|
||||
self.count = 0
|
||||
super().__init__(device, DiskAllocator(self), None, None)
|
||||
super().__init__(device, DiskAllocator(self), [], None)
|
||||
def _might_open(self, size:int):
|
||||
assert self.size is None or size <= self.size, f"can't reopen Disk tensor with larger size, opened with {self.size}, tried to open with {size}"
|
||||
if self.size is not None and hasattr(self, "mem"):
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from __future__ import annotations
|
||||
import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
|
||||
assert sys.platform != 'win32'
|
||||
from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, CompilerSet
|
||||
from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler
|
||||
from tinygrad.dtype import dtypes, DType, PtrDType
|
||||
from tinygrad.uop.ops import Ops, UOp
|
||||
from tinygrad.helpers import getenv, round_up, mv_address, to_mv, cpu_objdump, system, DEBUG, suppress_finalizing
|
||||
|
|
@ -146,10 +146,10 @@ class DSPCompiler(Compiler):
|
|||
|
||||
class DSPDevice(Compiled):
|
||||
def __init__(self, device:str=""):
|
||||
if getenv("MOCKDSP"): super().__init__(device, DSPAllocator(self), CompilerSet([(MockDSPRenderer, None)]), MockDSPProgram)
|
||||
if getenv("MOCKDSP"): super().__init__(device, DSPAllocator(self), [MockDSPRenderer], MockDSPProgram)
|
||||
else:
|
||||
self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
|
||||
super().__init__(device, DSPAllocator(self), CompilerSet([(DSPRenderer, None)]), functools.partial(DSPProgram, self))
|
||||
super().__init__(device, DSPAllocator(self), [DSPRenderer], functools.partial(DSPProgram, self))
|
||||
fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
|
||||
self.shell_buf = self.allocator.alloc(round_up(fastrpc_shell.nbytes, 0x1000), BufferSpec(nolru=True))
|
||||
ctypes.memmove(self.shell_buf.va_addr, mv_address(fastrpc_shell), fastrpc_shell.nbytes)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import ctypes, functools
|
||||
from tinygrad.helpers import mv_address, getenv, suppress_finalizing
|
||||
from tinygrad.device import Compiled, LRUAllocator, BufferSpec, CompilerSet
|
||||
from tinygrad.device import Compiled, LRUAllocator, BufferSpec
|
||||
from tinygrad.runtime.autogen import hip
|
||||
from tinygrad.renderer.cstyle import HIPRenderer
|
||||
from tinygrad.runtime.support.c import init_c_var, init_c_struct_t
|
||||
|
|
@ -15,8 +15,7 @@ class HIPDevice(Compiled):
|
|||
self.arch = init_c_var(hip.hipDeviceProp_t, lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
|
||||
self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t, lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
|
||||
|
||||
compilers = CompilerSet([(functools.partial(HIPRenderer, self.arch), None)])
|
||||
super().__init__(device, HIPAllocator(self), compilers, functools.partial(HIPProgram, self))
|
||||
super().__init__(device, HIPAllocator(self), [functools.partial(HIPRenderer, self.arch)], functools.partial(HIPProgram, self))
|
||||
def synchronize(self):
|
||||
check(hip.hipSetDevice(self.device_id))
|
||||
check(hip.hipDeviceSynchronize())
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import subprocess, pathlib, struct, ctypes, tempfile, functools, decimal, platform
|
||||
from tinygrad.helpers import prod, to_mv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap, suppress_finalizing
|
||||
import tinygrad.runtime.support.objc as objc
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent, CompilerSet
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent
|
||||
from tinygrad.renderer.cstyle import MetalRenderer
|
||||
from tinygrad.runtime.autogen import metal
|
||||
from tinygrad.runtime.support.c import DLL
|
||||
|
|
@ -42,7 +42,7 @@ class MetalDevice(Compiled):
|
|||
from tinygrad.runtime.graph.metal import MetalGraph
|
||||
# NOTE: GitHub CI macOS runners use paravirtualized metal which is broken with graph.
|
||||
# This can be reproduced locally with any virtualization software (like utm) that can create macOS VMs with apple's own virtualization framework.
|
||||
super().__init__(device, MetalAllocator(self), CompilerSet([(MetalRenderer, None)]),
|
||||
super().__init__(device, MetalAllocator(self), [MetalRenderer],
|
||||
functools.partial(MetalProgram, self), MetalGraph if 'virtual' not in from_ns_str(self.sysdevice.name()).lower() else None)
|
||||
|
||||
def synchronize(self):
|
||||
|
|
|
|||
|
|
@ -8,4 +8,4 @@ class NpyAllocator(Allocator['NpyDevice']):
|
|||
def _copyout(self, dest:memoryview, src:np.ndarray): dest[:] = self._as_buffer(src)
|
||||
|
||||
class NpyDevice(Compiled):
|
||||
def __init__(self, device:str): super().__init__(device, NpyAllocator(self), None, None)
|
||||
def __init__(self, device:str): super().__init__(device, NpyAllocator(self), [], None)
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
import functools
|
||||
from tinygrad.device import Compiled, Allocator, CompilerSet
|
||||
from tinygrad.device import Compiled, Allocator
|
||||
from tinygrad.engine.jit import MultiGraphRunner
|
||||
from tinygrad.renderer.cstyle import Renderer, CStyleLanguage, AMDHIPRenderer, QCOMCLRenderer
|
||||
from tinygrad.uop.ops import Ops
|
||||
from tinygrad.helpers import cpu_profile, EMULATE, NULL_QCOMCL, NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT
|
||||
from tinygrad.helpers import cpu_profile, EMULATE, NULL_ALLOW_COPYOUT
|
||||
from tinygrad.renderer.nir import IR3Renderer, NAKRenderer
|
||||
|
||||
class NullRenderer(CStyleLanguage):
|
||||
|
|
@ -39,7 +39,7 @@ class NullDevice(Compiled):
|
|||
case "AMD_CDNA4": renderer = functools.partial(AMDHIPRenderer, "gfx950")
|
||||
case "": renderer = NullRenderer
|
||||
case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
|
||||
compilers = CompilerSet([(renderer, None), (functools.partial(QCOMCLRenderer, 0x6030001), NULL_QCOMCL), # adreno 630
|
||||
(functools.partial(IR3Renderer, 0x6030001), NULL_IR3), # adreno 630
|
||||
(functools.partial(NAKRenderer, "sm_120", 48), NULL_NAK)]) # 5090
|
||||
super().__init__(device, NullAllocator(self), compilers, functools.partial(NullProgram, device), NullGraph)
|
||||
# adreno 630, 5090
|
||||
renderers:list[type[Renderer]|functools.partial] = [renderer, functools.partial(QCOMCLRenderer, 0x6030001),
|
||||
functools.partial(IR3Renderer, 0x6030001), functools.partial(NAKRenderer, "sm_120", 48)]
|
||||
super().__init__(device, NullAllocator(self), renderers, functools.partial(NullProgram, device), NullGraph)
|
||||
|
|
|
|||
|
|
@ -6,9 +6,9 @@ from dataclasses import dataclass
|
|||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import Compiled, BufferSpec, CompilerSet
|
||||
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, hi32, lo32, NV_CC, NV_PTX, NV_NAK, NV_NVCC, PROFILE
|
||||
from tinygrad.helpers import ContextVar, VIZ, ProfileEvent
|
||||
from tinygrad.device import Compiled, BufferSpec
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, hi32, lo32, PROFILE, ContextVar, VIZ, ProfileEvent
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
from tinygrad.runtime.autogen import nv_570, nv_580, mesa
|
||||
|
|
@ -618,11 +618,11 @@ class NVDevice(HCQCompiled[NVSignal]):
|
|||
self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
|
||||
self.sass_version = ((self.sm_version & 0xf00) >> 4) | (self.sm_version & 0xf)
|
||||
|
||||
compilers = CompilerSet(ctrl_var=NV_CC, cset=[(functools.partial(CUDARenderer, self.arch), None),
|
||||
(functools.partial(PTXRenderer, self.arch, device="NV"), NV_PTX),
|
||||
(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK),
|
||||
(functools.partial(CUDARenderer, self.arch, use_nvcc=True), NV_NVCC)])
|
||||
super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
|
||||
renderers:list[type[Renderer]|functools.partial] = [
|
||||
functools.partial(CUDARenderer, self.arch), functools.partial(PTXRenderer, self.arch, device="NV"),
|
||||
functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), functools.partial(CUDARenderer, self.arch, use_nvcc=True)
|
||||
]
|
||||
super().__init__(device, NVAllocator(self), renderers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
|
||||
|
||||
self.pma_enabled = PMA.value > 0 and PROFILE >= 1
|
||||
if self.pma_enabled: self._prof_init()
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from typing import Any, TYPE_CHECKING
|
|||
import pickle, base64, itertools, time, sys, functools
|
||||
from tinygrad.dtype import DType, dtypes, ImageDType, PtrDType, truncate, storage_fmt_for_dtype, to_storage_scalar, from_storage_scalar
|
||||
from tinygrad.helpers import all_same, getenv, flatten, get_single_element, EMULATE
|
||||
from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet
|
||||
from tinygrad.device import Compiled, Compiler, Allocator
|
||||
from tinygrad.codegen.opt import tc
|
||||
from tinygrad.uop.ops import exec_alu, python_alu, Ops, UOp, GroupOp, bitcast
|
||||
from tinygrad.renderer import Renderer
|
||||
|
|
@ -231,4 +231,4 @@ class PythonAllocator(Allocator['PythonDevice']):
|
|||
|
||||
class PythonDevice(Compiled):
|
||||
def __init__(self, device:str):
|
||||
super().__init__(device, PythonAllocator(self), CompilerSet([(PythonRenderer, None)]), PythonProgram)
|
||||
super().__init__(device, PythonAllocator(self), [PythonRenderer], PythonProgram)
|
||||
|
|
|
|||
|
|
@ -2,14 +2,14 @@ from __future__ import annotations
|
|||
import os, ctypes, functools, mmap, struct, array, math, sys, weakref, contextlib
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Any, cast
|
||||
from tinygrad.device import BufferSpec, CompilerSet, Device
|
||||
from tinygrad.device import BufferSpec, Device
|
||||
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
|
||||
from tinygrad.runtime.autogen import kgsl, mesa
|
||||
from tinygrad.renderer.cstyle import QCOMCLRenderer
|
||||
from tinygrad.renderer.nir import IR3Renderer
|
||||
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, ceildiv, prod, cpu_profile, lo32, suppress_finalizing
|
||||
from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE
|
||||
from tinygrad.helpers import next_power2, flatten, PROFILE
|
||||
from tinygrad.dtype import ImageDType, dtypes
|
||||
from tinygrad.runtime.support.system import System
|
||||
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
|
|
@ -378,10 +378,8 @@ class QCOMDevice(HCQCompiled):
|
|||
if PROFILE and self.gpu_id[:2] < (7, 3):
|
||||
System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")
|
||||
|
||||
compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[(functools.partial(QCOMCLRenderer, info.chip_id), None),
|
||||
(functools.partial(IR3Renderer, info.chip_id), QCOM_IR3)])
|
||||
super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
|
||||
functools.partial(QCOMComputeQueue, self), None)
|
||||
super().__init__(device, QCOMAllocator(self), [functools.partial(QCOMCLRenderer, info.chip_id), functools.partial(IR3Renderer, info.chip_id)],
|
||||
functools.partial(QCOMProgram, self), QCOMSignal, functools.partial(QCOMComputeQueue, self), None)
|
||||
|
||||
def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False) -> HCQBuffer:
|
||||
flags |= flag("KGSL_MEMALIGN", alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ TINYFS_TIMEOUT = getenv("TINYFS_TIMEOUT", 60)
|
|||
class TinyFSDevice(Compiled):
|
||||
def __init__(self, device:str):
|
||||
self.op = device[len("tinyfs:"):].upper()
|
||||
super().__init__(device, TinyFSAllocator(self), None, None, None)
|
||||
super().__init__(device, TinyFSAllocator(self), [], None, None)
|
||||
|
||||
self.sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
|
||||
self.sock.connect((TINYFS_ENDPOINT.rsplit(":", 1)[0], int(TINYFS_ENDPOINT.rsplit(":", 1)[1])))
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import functools, struct
|
||||
from tinygrad.device import Compiled, Allocator, BufferSpec, CompilerSet
|
||||
from tinygrad.device import Compiled, Allocator, BufferSpec
|
||||
from tinygrad.renderer.wgsl import WGSLRenderer
|
||||
from tinygrad.helpers import round_up, suppress_finalizing
|
||||
from tinygrad.runtime.autogen import webgpu
|
||||
|
|
@ -217,7 +217,7 @@ class WebGpuDevice(Compiled):
|
|||
self.device_res = _run(webgpu.wgpuAdapterRequestDeviceF, webgpu.WGPURequestDeviceCallbackInfo, webgpu.WGPURequestDeviceCallback,
|
||||
webgpu.WGPURequestDeviceStatus, 1, 2, adapter_res, dev_desc)
|
||||
|
||||
super().__init__(device, WebGpuAllocator(self), CompilerSet([(WGSLRenderer, None)]),
|
||||
super().__init__(device, WebGpuAllocator(self), [WGSLRenderer],
|
||||
functools.partial(WebGPUProgram, (self.device_res, webgpu.WGPUFeatureName_TimestampQuery in supported)))
|
||||
|
||||
def synchronize(self):
|
||||
|
|
|
|||
|
|
@ -5,10 +5,11 @@ try: import fcntl # windows misses that
|
|||
except ImportError: fcntl = None #type:ignore[assignment]
|
||||
from tinygrad.helpers import PROFILE, getenv, to_mv, from_mv, cpu_profile, ProfileRangeEvent, select_first_inited, unwrap, suppress_finalizing
|
||||
from tinygrad.helpers import TracingKey
|
||||
from tinygrad.device import BufferSpec, Compiled, LRUAllocator, ProfileDeviceEvent, ProfileProgramEvent, CompilerSet
|
||||
from tinygrad.device import BufferSpec, Compiled, LRUAllocator, ProfileDeviceEvent, ProfileProgramEvent
|
||||
from tinygrad.uop.ops import sym_infer, sint, UOp
|
||||
from tinygrad.runtime.autogen import libc
|
||||
from tinygrad.runtime.support.memory import BumpAllocator
|
||||
from tinygrad.renderer import Renderer
|
||||
|
||||
class MMIOInterface:
|
||||
def __init__(self, addr:int, nbytes:int, fmt='B'): self.mv, self.addr, self.nbytes, self.fmt = to_mv(addr, nbytes).cast(fmt), addr, nbytes, fmt
|
||||
|
|
@ -361,7 +362,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
|||
signal_pool: dict[str, list[HCQBuffer]] = collections.defaultdict(list) # per peer group
|
||||
cpu_devices: list[HCQCompiled] = []
|
||||
|
||||
def __init__(self, device:str, allocator:HCQAllocatorBase, compilers:CompilerSet, runtime, signal_t:Type[SignalType],
|
||||
def __init__(self, device:str, allocator:HCQAllocatorBase, compilers:list[type[Renderer]|functools.partial], runtime, signal_t:Type[SignalType],
|
||||
comp_queue_t:Callable[..., HWQueue], copy_queue_t:Callable[..., HWQueue]|None=None, kernargs_size=(16 << 20), sigalloc_size=0x1000,
|
||||
can_recover:bool=False):
|
||||
self.device_id:int = int(device.split(":")[1]) if ":" in device else 0
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue