mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
remove CompilerPair (#14638)
This commit is contained in:
parent
396e1320fb
commit
e6562a5061
21 changed files with 112 additions and 147 deletions
|
|
@ -1,5 +1,5 @@
|
|||
import unittest
|
||||
from tinygrad.device import CompileError, Device, Compiler
|
||||
from tinygrad.device import CompileError, Device
|
||||
if Device.DEFAULT=="METAL":
|
||||
from tinygrad.runtime.ops_metal import MetalDevice, MetalCompiler, MetalProgram
|
||||
@unittest.skipIf(Device.DEFAULT!="METAL", "Metal support required")
|
||||
|
|
@ -48,28 +48,4 @@ kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgr
|
|||
""")
|
||||
with self.assertRaises(RuntimeError):
|
||||
compiled = compiled[:40] # corrupt the compiled program
|
||||
MetalProgram(device, "r_5", compiled)
|
||||
|
||||
def test_program_w_empty_compiler(self):
|
||||
device = MetalDevice("metal")
|
||||
compiler = Compiler(device)
|
||||
compiled = compiler.compile("""
|
||||
#include <metal_stdlib>
|
||||
kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]){
|
||||
data0[0] = 0;
|
||||
}
|
||||
""")
|
||||
MetalProgram(device, "r_5", compiled)
|
||||
|
||||
def test_bad_program_w_empty_compiler(self):
|
||||
device = MetalDevice("metal")
|
||||
compiler = Compiler(device)
|
||||
# this does not raise
|
||||
compiled = compiler.compile("""
|
||||
#include <metal_stdlib>
|
||||
kernel void r_5(device int* data0, const device int* data1, uint3 gid [[threadgroup_position_in_grid]], uint3 lid [[thread_position_in_threadgroup]]){
|
||||
invalid codes;
|
||||
}
|
||||
""")
|
||||
with self.assertRaises(RuntimeError):
|
||||
MetalProgram(device, "r_5", compiled)
|
||||
|
|
@ -1,9 +1,19 @@
|
|||
import unittest
|
||||
from unittest.mock import patch
|
||||
from tinygrad import Device
|
||||
from tinygrad.device import Buffer
|
||||
from tinygrad.dtype import dtypes
|
||||
from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
|
||||
class TestCLCompileCache(unittest.TestCase):
|
||||
def test_compile_cached(self):
|
||||
device = Device[Device.DEFAULT]
|
||||
src = "__kernel void cached_test(__global int* a) { a[0] = 1; }"
|
||||
CLProgram(device, name="cached_test", lib=src.encode())
|
||||
with patch.object(CLCompiler, 'compile', side_effect=RuntimeError("compile should not be called on cache hit")):
|
||||
CLProgram(device, name="cached_test", lib=src.encode())
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
|
||||
class TestCLError(unittest.TestCase):
|
||||
@unittest.skip("allocates tons of memory")
|
||||
|
|
@ -17,7 +27,7 @@ class TestCLError(unittest.TestCase):
|
|||
def test_invalid_kernel_name(self):
|
||||
device = Device[Device.DEFAULT]
|
||||
with self.assertRaises(RuntimeError) as err:
|
||||
CLProgram(device, name="", lib=CLCompiler(device, "test").compile("__kernel void test(__global int* a) { a[0] = 1; }"))
|
||||
CLProgram(device, name="", lib="__kernel void test(__global int* a) { a[0] = 1; }".encode())
|
||||
assert str(err.exception) == "OpenCL Error -46: CL_INVALID_KERNEL_NAME"
|
||||
|
||||
def test_unaligned_copy(self):
|
||||
|
|
|
|||
|
|
@ -139,7 +139,6 @@ def do_render(ctx:Renderer, prg:UOp, lin:UOp) -> UOp:
|
|||
return prg.replace(src=prg.src + (UOp(Ops.SOURCE, arg=src),), arg=ctx.aux(list(lin.src)) if ctx.has_aux else prg.arg)
|
||||
|
||||
def do_compile(ctx:Renderer, prg:UOp, source:UOp) -> UOp|None:
|
||||
if ctx.compiler is None: return None
|
||||
lib = ctx.compiler.compile_cached(source.arg)
|
||||
return prg.replace(src=prg.src + (UOp(Ops.BINARY, arg=lib),))
|
||||
|
||||
|
|
|
|||
|
|
@ -1,14 +1,14 @@
|
|||
from __future__ import annotations
|
||||
from dataclasses import dataclass, replace
|
||||
from collections import defaultdict
|
||||
from typing import Any, Generic, TypeVar, Iterator, Generator
|
||||
from typing import Any, Generic, TypeVar, Iterator, Generator, TYPE_CHECKING
|
||||
import importlib, inspect, functools, pathlib, os, platform, contextlib, sys, re, atexit, pickle, decimal
|
||||
from tinygrad.helpers import CI, OSX, LRU, getenv, diskcache_get, diskcache_put, DEBUG, GlobalCounters, flat_mv, PROFILE, temp, colored
|
||||
from tinygrad.helpers import Context, CCACHE, ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE, cpu_events, ProfileEvent, ProfilePointEvent, dedup, ContextVar
|
||||
from tinygrad.helpers import unwrap_class_type, suppress_finalizing, select_first_inited, VIZ, CPU_LLVM, CPU_LVP, NV_PTX, CUDA_PTX, NV_NAK
|
||||
from tinygrad.helpers import EMULATED_DTYPES
|
||||
from tinygrad.dtype import DType, ImageDType, PtrDType, dtypes, _to_np_dtype
|
||||
from tinygrad.renderer import Renderer
|
||||
if TYPE_CHECKING: from tinygrad.renderer import Renderer
|
||||
|
||||
# **************** Device ****************
|
||||
|
||||
|
|
@ -278,39 +278,34 @@ class Compiler:
|
|||
def disassemble(self, lib:bytes): pass
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CompilerPair:
|
||||
renderer:type[Renderer]|functools.partial; compiler:type[Compiler]|functools.partial|None = None; ctrl_var:ContextVar|None = None # noqa: E702
|
||||
name:str|None = None
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class CompilerSet: cset:list[CompilerPair]; ctrl_var:ContextVar|None = None # noqa: E702
|
||||
class CompilerSet: cset:list[tuple[type[Renderer]|functools.partial, ContextVar|None]]; ctrl_var:ContextVar|None = None # noqa: E702
|
||||
|
||||
class Compiled:
|
||||
profile_events:list[ProfileEvent] = [ProfileDeviceEvent("CPU")] # NOTE: CPU is the default device.
|
||||
|
||||
def __init__(self, device:str, allocator:Allocator, compilers:CompilerSet|None, runtime, graph=None, group_id=None):
|
||||
from tinygrad.renderer import Renderer
|
||||
|
||||
self.device, self.allocator, self.runtime, self.graph, self.group_id = device, allocator, runtime, graph, group_id
|
||||
|
||||
self.comps_ctrl_var = compilers.ctrl_var if compilers is not None else None
|
||||
self.comp_sets:dict[Any, tuple[ContextVar|None, tuple[type[Renderer]|functools.partial, type[Compiler]|functools.partial|None]]] = {}
|
||||
self.cached_pair:dict[Any, tuple[Renderer, Compiler|None]] = {}
|
||||
for cpair in (compilers.cset if compilers is not None else [CompilerPair(Renderer, Compiler)]):
|
||||
self.comp_sets[cpair.name or self._compiler_name(cpair.renderer, cpair.compiler)] = (cpair.ctrl_var, (cpair.renderer, cpair.compiler))
|
||||
self.comp_sets:dict[str, tuple[ContextVar|None, type[Renderer]|functools.partial]] = {}
|
||||
self.cached_pair:dict[Any, Renderer] = {}
|
||||
for ren, var in (compilers.cset if compilers is not None else [(Renderer, None)]):
|
||||
self.comp_sets[var.key.split('_', 1)[-1] if var is not None else self._compiler_name(ren)] = (var, ren)
|
||||
|
||||
@property
|
||||
def renderer(self) -> Renderer: return self._select_compiler_pair()[0]
|
||||
def renderer(self) -> Renderer: return self._select_compiler_pair()
|
||||
|
||||
@property
|
||||
def compiler(self) -> Compiler:
|
||||
if (ret:=self.renderer.compiler or self._select_compiler_pair()[1]) is None: raise RuntimeError(f"no compiler for {self.device}")
|
||||
if (ret:=self.renderer.compiler) is None: raise RuntimeError(f"no compiler for {self.device}")
|
||||
return ret
|
||||
|
||||
def _compiler_name(self, r:type[Renderer]|functools.partial, c:type[Compiler]|functools.partial|None) -> str:
|
||||
devname = self.device.split(':')[0].upper()
|
||||
if c is None: return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname) or devname
|
||||
return unwrap_class_type(c).__name__.upper().removesuffix("COMPILER").removeprefix(devname) or devname
|
||||
def _compiler_name(self, r:type[Renderer]|functools.partial) -> str:
|
||||
return unwrap_class_type(r).__name__.upper().removesuffix("RENDERER").removeprefix(devname:=self.device.split(':')[0].upper()) or devname
|
||||
|
||||
def _select_compiler_pair(self) -> tuple[Renderer, Compiler|None]:
|
||||
def _select_compiler_pair(self) -> Renderer:
|
||||
# select forced compiler from global env var.
|
||||
forced_comps = set([self.comp_sets[val][1]] if self.comps_ctrl_var is not None and (val:=self.comps_ctrl_var.value) else [])
|
||||
|
||||
|
|
@ -399,18 +394,18 @@ def enumerate_devices_str() -> Generator[str, None, None]:
|
|||
d = Device[device]
|
||||
default_comp_pairs, default_compiler, cc_ctrl_var = d.comp_sets, d.compiler, d.comps_ctrl_var
|
||||
try:
|
||||
for k,(en,(r,c)) in default_comp_pairs.items():
|
||||
d.comp_sets = {k:(None,(r,c))} # env var set to None, so it doesn't interfere
|
||||
for k,(en,r) in default_comp_pairs.items():
|
||||
d.comp_sets = {k:(None,r)} # env var set to None, so it doesn't interfere
|
||||
d.comps_ctrl_var = None
|
||||
try:
|
||||
# d.renderer, d.compiler = r(), c()
|
||||
with Context(CACHELEVEL=0): test = (Tensor([1,2,3], device=device) * 2).tolist()
|
||||
if test != [2,4,6]: raise ValueError(f"got {test} instead of [2, 4, 6]")
|
||||
set_text = f'({cc_ctrl_var.key}={d._compiler_name(r, c)} to make default)' if cc_ctrl_var is not None else ''
|
||||
set_text = f'({cc_ctrl_var.key}={d._compiler_name(r)} to make default)' if cc_ctrl_var is not None else ''
|
||||
default_text = '(default)' if type(default_compiler) is type(d.compiler) else set_text
|
||||
compilers_results.append(f"{colored('+', 'green')} {d._compiler_name(r, c)} {default_text}")
|
||||
compilers_results.append(f"{colored('+', 'green')} {d._compiler_name(r)} {default_text}")
|
||||
any_works = True
|
||||
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._compiler_name(r, c)}: {e}")
|
||||
except Exception as e: compilers_results.append(f"{colored('-', 'yellow')} {d._compiler_name(r)}: {e}")
|
||||
finally:
|
||||
# put the defaults back!
|
||||
d.comp_sets, d.comps_ctrl_var = default_comp_pairs, cc_ctrl_var
|
||||
|
|
|
|||
|
|
@ -185,11 +185,12 @@ ALLOW_DEVICE_USAGE, MAX_BUFFER_SIZE = ContextVar("ALLOW_DEVICE_USAGE", 1), Conte
|
|||
EMULATE, EMULATED_DTYPES = ContextVar("EMULATE", ""), ContextVar("EMULATED_DTYPES", "")
|
||||
CPU_COUNT = ContextVar("CPU_COUNT", max(1, len(os.sched_getaffinity(0)) if hasattr(os, "sched_getaffinity") else (os.cpu_count() or 1)))
|
||||
# Compilers
|
||||
CPU_LLVM, CPU_LVP, AMD_LLVM = ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0), ContextVar("AMD_LLVM", 0)
|
||||
NV_PTX, CUDA_PTX, NV_NAK, QCOM_IR3 = ContextVar("NV_PTX", 0), ContextVar("CUDA_PTX", 0), ContextVar("NV_NAK", 0), ContextVar("QCOM_IR3", 0)
|
||||
CPU_CC, CPU_LLVM, CPU_LVP = ContextVar("CPU_CC", ""), ContextVar("CPU_LLVM", 0), ContextVar("CPU_LVP", 0)
|
||||
NV_CC, NV_PTX, NV_NAK = ContextVar("NV_CC", ""), ContextVar("NV_PTX", 0), ContextVar("NV_NAK", 0)
|
||||
CUDA_CC, CUDA_PTX, CUDA_NVCC = ContextVar("CUDA_CC", ""), ContextVar("CUDA_PTX", 0), ContextVar("CUDA_NVCC", 0)
|
||||
NULL_IR3, NULL_NAK, NULL_ALLOW_COPYOUT = ContextVar("NULL_IR3", 0), ContextVar("NULL_NAK", 0), ContextVar("NULL_ALLOW_COPYOUT", 0)
|
||||
AMD_CC, CPU_CC, NV_CC, CUDA_CC = ContextVar("AMD_CC", ""), ContextVar("CPU_CC", ""), ContextVar("NV_CC", ""), ContextVar("CUDA_CC", "")
|
||||
QCOM_CC = ContextVar("QCOM_CC", "")
|
||||
AMD_CC, AMD_LLVM, AMD_HIPCC = ContextVar("AMD_CC", ""), ContextVar("AMD_LLVM", 0), ContextVar("AMD_HIPCC", 0)
|
||||
QCOM_CC, QCOM_IR3 = ContextVar("QCOM_CC", ""), ContextVar("QCOM_IR3", 0)
|
||||
# VIZ implies PROFILE, but you can run PROFILE without VIZ
|
||||
VIZ = ContextVar("VIZ", 0)
|
||||
PROFILE = ContextVar("PROFILE", abs(VIZ.value))
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
from __future__ import annotations
|
||||
from typing import Callable, cast, TYPE_CHECKING
|
||||
from typing import Callable, cast
|
||||
import functools
|
||||
from dataclasses import dataclass, field
|
||||
from tinygrad.helpers import to_function_name, dedup, prod, DEBUG
|
||||
|
|
@ -7,7 +7,7 @@ from tinygrad.uop.ops import Ops, UOp, sym_infer, sint, Variable, ssimplify, Gro
|
|||
from tinygrad.dtype import AddrSpace, PtrDType
|
||||
from tinygrad.codegen.opt.tc import TensorCore
|
||||
from tinygrad.codegen.opt import Opt
|
||||
if TYPE_CHECKING: from tinygrad.device import Compiler
|
||||
from tinygrad.device import Compiler
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class Estimates:
|
||||
|
|
@ -150,7 +150,8 @@ class Renderer:
|
|||
pre_matcher: PatternMatcher|None = None
|
||||
extra_matcher: PatternMatcher|None = None
|
||||
code_for_op: dict[Ops, Callable] = {}
|
||||
compiler: Compiler|None = None
|
||||
|
||||
compiler: Compiler = Compiler()
|
||||
|
||||
def __reduce__(self): return self.__class__, ()
|
||||
def render(self, uops:list[UOp]) -> str: raise NotImplementedError("needs a renderer")
|
||||
|
|
|
|||
|
|
@ -3,10 +3,9 @@ import os, math, sys, struct
|
|||
from collections import defaultdict, Counter
|
||||
from tinygrad.codegen.opt import tc
|
||||
from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str, axis_letters
|
||||
from tinygrad.helpers import strip_parens, getenv, prod, dedup, select_first_inited, AMX, CPU_COUNT
|
||||
from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX, CPU_COUNT
|
||||
from tinygrad.dtype import ImageDType, dtypes, DType, PtrDType, AddrSpace, truncate, float_to_bf16
|
||||
from tinygrad.renderer import Renderer
|
||||
from tinygrad.device import Compiler
|
||||
from tinygrad.codegen.late.devectorizer import no_vectorized_alu
|
||||
|
||||
|
||||
|
|
@ -343,8 +342,7 @@ class MetalRenderer(CStyleLanguage):
|
|||
shared_max = 32768
|
||||
def __init__(self):
|
||||
from tinygrad.runtime.ops_metal import MetalCompiler
|
||||
self.compiler = select_first_inited([MetalCompiler, Compiler], "No compiler for METAL is available")
|
||||
self.tensor_cores = tc.metal if hasattr(os, 'uname') and os.uname().machine == "arm64" else []
|
||||
self.compiler, self.tensor_cores = MetalCompiler(), tc.metal if hasattr(os, 'uname') and os.uname().machine == "arm64" else []
|
||||
|
||||
# language options
|
||||
kernel_typedef = "kernel void"
|
||||
|
|
|
|||
|
|
@ -115,7 +115,7 @@ def __getattr__(nm):
|
|||
return load("rocprof", "['rocprof-trace-decoder', p:='/usr/local/lib/rocprof-trace-decoder.so', p.replace('so','dylib')]",
|
||||
[f"{{}}/include/{s}.h" for s in ["rocprof_trace_decoder", "trace_decoder_instrument", "trace_decoder_types"]],
|
||||
tarball="https://github.com/ROCm/rocprof-trace-decoder/archive/dd0485100971522cc4cd8ae136bdda431061a04d.tar.gz")
|
||||
case "mesa": return load("mesa", "['tinymesa_cpu', 'tinymesa']", [
|
||||
case "mesa": return load("mesa", "([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu']", [
|
||||
*[f"{{}}/src/compiler/nir/{s}.h" for s in ["nir", "nir_builder", "nir_shader_compiler_options", "nir_serialize"]], "{}/gen/nir_intrinsics.h",
|
||||
*[f"{{}}/src/nouveau/{s}.h" for s in ["headers/nv_device_info", "compiler/nak"]],
|
||||
*[f"{{}}/src/gallium/auxiliary/gallivm/lp_bld{s}.h" for s in ["", "_passmgr", "_misc", "_type", "_init", "_nir", "_struct", "_jit_types",
|
||||
|
|
@ -134,7 +134,8 @@ def __getattr__(nm):
|
|||
*[f"python3 src/compiler/{s}_h.py > gen/{s.split('/')[-1]}.h" for s in ["nir/nir_opcodes", "nir/nir_builder_opcodes"]],
|
||||
*[f"python3 src/compiler/nir/nir_{s}_h.py --outdir gen" for s in ["intrinsics", "intrinsics_indices"]]]), cwd=path, shell=True, check=True),
|
||||
tarball="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.7/mesa-25.2.7.tar.gz",
|
||||
prolog=["import gzip, base64"], epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
|
||||
prolog=["from tinygrad.helpers import CPU_CC, CPU_LVP", "import gzip, base64"],
|
||||
epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
|
||||
case "libclang":
|
||||
return load("libclang", clang_lib,
|
||||
lambda: [f"{system('llvm-config-20 --includedir')}/clang-c/{s}.h" for s in ["Index", "CXString", "CXSourceLocation", "CXFile"]],
|
||||
|
|
|
|||
|
|
@ -4,8 +4,9 @@ import ctypes
|
|||
from typing import Annotated, Literal, TypeAlias
|
||||
from tinygrad.runtime.support.c import _IO, _IOW, _IOR, _IOWR
|
||||
from tinygrad.runtime.support import c
|
||||
from tinygrad.helpers import CPU_CC, CPU_LVP
|
||||
import gzip, base64
|
||||
dll = c.DLL('mesa', ['tinymesa_cpu', 'tinymesa'])
|
||||
dll = c.DLL('mesa', ([] if CPU_CC.value == 'LVP' or bool(CPU_LVP) else ['tinymesa']) + ['tinymesa_cpu'])
|
||||
class struct_u_printf_info(ctypes.Structure): pass
|
||||
u_printf_info: TypeAlias = struct_u_printf_info
|
||||
uint32_t: TypeAlias = Annotated[int, ctypes.c_uint32]
|
||||
|
|
|
|||
|
|
@ -6,9 +6,9 @@ from dataclasses import dataclass
|
|||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, FileIOInterface
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, BumpAllocator, hcq_filter_visible_devices
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet, CompilerPair
|
||||
from tinygrad.device import Compiled, DMAFdRef, BufferSpec, CompilerSet
|
||||
from tinygrad.helpers import getenv, round_up, data64_le, DEBUG, PROFILE, ProfileEvent, lo32, hi32, colored, prod, ContextVar
|
||||
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, ceildiv, unwrap
|
||||
from tinygrad.helpers import VIZ, AMD_CC, AMD_LLVM, AMD_HIPCC, ceildiv, unwrap
|
||||
from tinygrad.renderer.cstyle import AMDHIPRenderer, AMDHIPCCRenderer
|
||||
from tinygrad.renderer.llvmir import AMDLLVMRenderer
|
||||
from tinygrad.runtime.autogen import kfd, hsa, pci, sqtt, amdgpu_kd, amdgpu_drm
|
||||
|
|
@ -962,9 +962,9 @@ class AMDDevice(HCQCompiled):
|
|||
self.sdma_queues:dict = {}
|
||||
self.has_sdma_queue = self.sdma_queue(0) is not None
|
||||
|
||||
compilers = CompilerSet([CompilerPair(functools.partial(AMDHIPRenderer, self.arch), None),
|
||||
CompilerPair(functools.partial(AMDLLVMRenderer, self.arch), None, AMD_LLVM),
|
||||
CompilerPair(functools.partial(AMDHIPCCRenderer, self.arch), None)], ctrl_var=AMD_CC)
|
||||
compilers = CompilerSet([(functools.partial(AMDHIPRenderer, self.arch), None),
|
||||
(functools.partial(AMDLLVMRenderer, self.arch), AMD_LLVM),
|
||||
(functools.partial(AMDHIPCCRenderer, self.arch), AMD_HIPCC)], ctrl_var=AMD_CC)
|
||||
|
||||
super().__init__(device, AMDAllocator(self), compilers, functools.partial(AMDProgram, self), AMDSignal,
|
||||
functools.partial(AMDComputeAQLQueue if self.is_aql else AMDComputeQueue, self),
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ from tinygrad.runtime.autogen import opencl as cl
|
|||
from tinygrad.runtime.support import c
|
||||
from tinygrad.helpers import to_char_p_p, from_mv, OSX, DEBUG, mv_address, suppress_finalizing
|
||||
from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
|
||||
from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError, CompilerPair, CompilerSet
|
||||
from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError, CompilerSet
|
||||
from tinygrad.dtype import ImageDType
|
||||
|
||||
CC_CB = c.CFUNCTYPE[None, [c.POINTER[ctypes.c_char], c.POINTER[None], cl.size_t, c.POINTER[None]]]
|
||||
|
|
@ -39,9 +39,9 @@ class CLCompiler(Compiler):
|
|||
|
||||
class CLProgram:
|
||||
def __init__(self, device:CLDevice, name:str, lib:bytes, buf_dtypes=[], **kwargs):
|
||||
self.dev, self.name, self.lib, self.buf_dtypes = device, name, lib, buf_dtypes
|
||||
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(lib)),
|
||||
to_char_p_p([lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
|
||||
self.dev, self.name, self.lib, self.buf_dtypes = device, name, device.clc.compile_cached(lib.decode()), buf_dtypes
|
||||
self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(self.lib)),
|
||||
to_char_p_p([self.lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
|
||||
errcode_ret := ctypes.c_int32()), errcode_ret)
|
||||
check(binary_status.value)
|
||||
check(cl.clBuildProgram(self.program, 1, device.device_id, None, BP_CB(), None)) # NOTE: OSX requires this
|
||||
|
|
@ -125,8 +125,8 @@ class CLDevice(Compiled):
|
|||
ctypes.string_at(buf, size=total.value).decode())[1]
|
||||
|
||||
renderer = IntelRenderer if "cl_intel_subgroup_matrix_multiply_accumulate" in self.device_exts else OpenCLRenderer
|
||||
compiler = functools.partial(CLCompiler, self, f"{hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()}")
|
||||
super().__init__(device, CLAllocator(self), CompilerSet([CompilerPair(renderer, compiler)]), functools.partial(CLProgram, self))
|
||||
self.clc = CLCompiler(self, f"{hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()}")
|
||||
super().__init__(device, CLAllocator(self), CompilerSet([(renderer, None)]), functools.partial(CLProgram, self))
|
||||
def synchronize(self):
|
||||
check(cl.clFinish(self.queue))
|
||||
self.pending_copyin.clear()
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ from __future__ import annotations
|
|||
import platform, sys, ctypes, functools, time, mmap, threading, queue
|
||||
from tinygrad.helpers import to_mv, OSX, WIN, mv_address, wait_cond, suppress_finalizing, unwrap, data64_le
|
||||
from tinygrad.helpers import CPU_CC, CPU_LVP, CPU_LLVM
|
||||
from tinygrad.device import BufferSpec, DMACPURef, CompilerSet, CompilerPair
|
||||
from tinygrad.device import BufferSpec, DMACPURef, CompilerSet
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, HCQArgsState, HCQSignal, HCQProgram, MMIOInterface
|
||||
from tinygrad.runtime.support.hcq import CLikeArgsState
|
||||
from tinygrad.renderer.cstyle import ClangJITRenderer
|
||||
|
|
@ -133,6 +133,5 @@ class CPUDevice(HCQCompiled):
|
|||
def __init__(self, device:str=""):
|
||||
self.tasks:queue.Queue = queue.Queue()
|
||||
CPUWorker(self, self.tasks, thread_id=0).start()
|
||||
compilers = CompilerSet([CompilerPair(ClangJITRenderer, None), CompilerPair(CPULLVMRenderer, None, ctrl_var=CPU_LLVM),
|
||||
CompilerPair(LVPRenderer, None, ctrl_var=CPU_LVP)], ctrl_var=CPU_CC)
|
||||
compilers = CompilerSet([(ClangJITRenderer, None), (CPULLVMRenderer, CPU_LLVM), (LVPRenderer, CPU_LVP)], ctrl_var=CPU_CC)
|
||||
super().__init__(device, CPUAllocator(self), compilers, functools.partial(CPUProgram, self), CPUSignal, CPUComputeQueue)
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from __future__ import annotations
|
||||
import ctypes, functools
|
||||
from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing, CUDA_CC, CUDA_PTX
|
||||
from tinygrad.device import Compiled, BufferSpec, LRUAllocator, CompilerPair, CompilerSet
|
||||
from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing, CUDA_CC, CUDA_PTX, CUDA_NVCC
|
||||
from tinygrad.device import Compiled, BufferSpec, LRUAllocator, CompilerSet
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
from tinygrad.runtime.autogen import cuda
|
||||
|
|
@ -118,9 +118,9 @@ class CUDADevice(Compiled):
|
|||
CUDADevice.devices.append(self)
|
||||
|
||||
from tinygrad.runtime.graph.cuda import CUDAGraph
|
||||
compilers = CompilerSet([CompilerPair(functools.partial(CUDARenderer, self.arch, device="CUDA")),
|
||||
CompilerPair(functools.partial(PTXRenderer, self.arch, device="CUDA"), ctrl_var=CUDA_PTX),
|
||||
CompilerPair(functools.partial(CUDARenderer, self.arch, device="CUDA", use_nvcc=True), name="NVCC")], ctrl_var=CUDA_CC)
|
||||
compilers = CompilerSet([(functools.partial(CUDARenderer, self.arch, device="CUDA"), None),
|
||||
(functools.partial(PTXRenderer, self.arch, device="CUDA"), CUDA_PTX),
|
||||
(functools.partial(CUDARenderer, self.arch, device="CUDA", use_nvcc=True), CUDA_NVCC)], ctrl_var=CUDA_CC)
|
||||
super().__init__(device, CUDAAllocator(self), compilers, functools.partial(CUDAProgram, self), None if MOCKGPU else CUDAGraph)
|
||||
|
||||
def synchronize(self):
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from __future__ import annotations
|
||||
import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
|
||||
assert sys.platform != 'win32'
|
||||
from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, CompilerSet, CompilerPair
|
||||
from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, CompilerSet
|
||||
from tinygrad.dtype import dtypes, DType, PtrDType
|
||||
from tinygrad.uop.ops import Ops, UOp
|
||||
from tinygrad.helpers import getenv, round_up, mv_address, to_mv, cpu_objdump, system, DEBUG, suppress_finalizing
|
||||
|
|
@ -146,10 +146,10 @@ class DSPCompiler(Compiler):
|
|||
|
||||
class DSPDevice(Compiled):
|
||||
def __init__(self, device:str=""):
|
||||
if getenv("MOCKDSP"): super().__init__(device, DSPAllocator(self), CompilerSet([CompilerPair(MockDSPRenderer)]), MockDSPProgram)
|
||||
if getenv("MOCKDSP"): super().__init__(device, DSPAllocator(self), CompilerSet([(MockDSPRenderer, None)]), MockDSPProgram)
|
||||
else:
|
||||
self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
|
||||
super().__init__(device, DSPAllocator(self), CompilerSet([CompilerPair(DSPRenderer)]), functools.partial(DSPProgram, self))
|
||||
super().__init__(device, DSPAllocator(self), CompilerSet([(DSPRenderer, None)]), functools.partial(DSPProgram, self))
|
||||
fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
|
||||
self.shell_buf = self.allocator.alloc(round_up(fastrpc_shell.nbytes, 0x1000), BufferSpec(nolru=True))
|
||||
ctypes.memmove(self.shell_buf.va_addr, mv_address(fastrpc_shell), fastrpc_shell.nbytes)
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import ctypes, functools
|
||||
from tinygrad.helpers import mv_address, getenv, suppress_finalizing
|
||||
from tinygrad.device import Compiled, LRUAllocator, BufferSpec, CompilerSet, CompilerPair
|
||||
from tinygrad.device import Compiled, LRUAllocator, BufferSpec, CompilerSet
|
||||
from tinygrad.runtime.autogen import hip
|
||||
from tinygrad.renderer.cstyle import HIPRenderer
|
||||
from tinygrad.runtime.support.c import init_c_var, init_c_struct_t
|
||||
|
|
@ -15,7 +15,7 @@ class HIPDevice(Compiled):
|
|||
self.arch = init_c_var(hip.hipDeviceProp_t, lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
|
||||
self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t, lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
|
||||
|
||||
compilers = CompilerSet([CompilerPair(functools.partial(HIPRenderer, self.arch), None)])
|
||||
compilers = CompilerSet([(functools.partial(HIPRenderer, self.arch), None)])
|
||||
super().__init__(device, HIPAllocator(self), compilers, functools.partial(HIPProgram, self))
|
||||
def synchronize(self):
|
||||
check(hip.hipSetDevice(self.device_id))
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import subprocess, pathlib, struct, ctypes, tempfile, functools, contextlib, decimal, platform
|
||||
from tinygrad.helpers import prod, to_mv, getenv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap, suppress_finalizing
|
||||
import subprocess, pathlib, struct, ctypes, tempfile, functools, decimal, platform
|
||||
from tinygrad.helpers import prod, to_mv, round_up, cache_dir, PROFILE, ProfileRangeEvent, cpu_profile, unwrap, suppress_finalizing
|
||||
import tinygrad.runtime.support.objc as objc
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent, CompilerSet, CompilerPair
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, ProfileDeviceEvent, CompilerSet
|
||||
from tinygrad.renderer.cstyle import MetalRenderer
|
||||
from tinygrad.runtime.autogen import metal
|
||||
from tinygrad.runtime.support.c import DLL
|
||||
|
|
@ -42,7 +42,7 @@ class MetalDevice(Compiled):
|
|||
from tinygrad.runtime.graph.metal import MetalGraph
|
||||
# NOTE: GitHub CI macOS runners use paravirtualized metal which is broken with graph.
|
||||
# This can be reproduced locally with any virtualization software (like utm) that can create macOS VMs with apple's own virtualization framework.
|
||||
super().__init__(device, MetalAllocator(self), CompilerSet([CompilerPair(MetalRenderer, None)]),
|
||||
super().__init__(device, MetalAllocator(self), CompilerSet([(MetalRenderer, None)]),
|
||||
functools.partial(MetalProgram, self), MetalGraph if 'virtual' not in from_ns_str(self.sysdevice.name()).lower() else None)
|
||||
|
||||
def synchronize(self):
|
||||
|
|
@ -54,20 +54,12 @@ class MetalDevice(Compiled):
|
|||
Compiled.profile_events += [ProfileRangeEvent(self.device, lb, st, en)]
|
||||
self.mtl_buffers_in_flight.clear()
|
||||
|
||||
def metal_src_to_library(device:MetalDevice, src:str) -> metal.MTLLibrary:
|
||||
options = metal.MTLCompileOptions.new()
|
||||
options.setFastMathEnabled(getenv("METAL_FAST_MATH"))
|
||||
library = device.sysdevice.newLibraryWithSource_options_error(to_ns_str(src), options, ctypes.byref(compileError:=metal.NSError().retained()))
|
||||
error_check(compileError, CompileError)
|
||||
return library
|
||||
|
||||
class MetalCompiler(Compiler):
|
||||
# Opening METAL after LLVM doesn't fail because ctypes.CDLL opens with RTLD_LOCAL but MTLCompiler opens it's own llvm with RTLD_GLOBAL
|
||||
# This means that MTLCompiler's llvm will create it's own instances of global state because RTLD_LOCAL doesn't export symbols, but if RTLD_GLOBAL
|
||||
# library is loaded first then RTLD_LOCAL library will just use it's symbols. On linux there is RTLD_DEEPBIND to prevent that, but on macos there
|
||||
# doesn't seem to be anything we can do.
|
||||
with contextlib.suppress(FileNotFoundError, ModuleNotFoundError):
|
||||
import tinygrad.runtime.autogen.llvm # noqa: F401
|
||||
import tinygrad.runtime.autogen.llvm as _
|
||||
support = DLL("MTLCompiler", "MTLCompiler")
|
||||
support.MTLCodeGenServiceCreate.restype = ctypes.c_void_p
|
||||
|
||||
|
|
@ -118,15 +110,9 @@ class MetalCompiler(Compiler):
|
|||
class MetalProgram:
|
||||
def __init__(self, dev:MetalDevice, name:str, lib:bytes, **kwargs):
|
||||
self.dev, self.name, self.lib = dev, name, lib
|
||||
if lib[:4] == b"MTLB":
|
||||
# binary metal library
|
||||
data = objc.dispatch_data_create(lib, len(lib), None, None)
|
||||
self.library = self.dev.sysdevice.newLibraryWithData_error(data, ctypes.byref(error_lib:=metal.NSError().retained())).retained()
|
||||
error_check(error_lib)
|
||||
else:
|
||||
# metal source. rely on OS caching
|
||||
try: self.library = metal_src_to_library(self.dev, lib.decode())
|
||||
except CompileError as e: raise RuntimeError from e
|
||||
data = objc.dispatch_data_create(lib, len(lib), None, None)
|
||||
self.library = self.dev.sysdevice.newLibraryWithData_error(data, ctypes.byref(error_lib:=metal.NSError().retained())).retained()
|
||||
error_check(error_lib)
|
||||
self.fxn = self.library.newFunctionWithName(to_ns_str(name)).retained()
|
||||
descriptor = metal.MTLComputePipelineDescriptor.new()
|
||||
descriptor.setComputeFunction(self.fxn)
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import functools
|
||||
from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
|
||||
from tinygrad.device import Compiled, Allocator, CompilerSet
|
||||
from tinygrad.engine.jit import MultiGraphRunner
|
||||
from tinygrad.renderer.cstyle import Renderer, CStyleLanguage, AMDHIPRenderer
|
||||
from tinygrad.uop.ops import Ops
|
||||
|
|
@ -39,6 +39,6 @@ class NullDevice(Compiled):
|
|||
case "AMD_CDNA4": renderer = functools.partial(AMDHIPRenderer, "gfx950")
|
||||
case "": renderer = NullRenderer
|
||||
case _: raise RuntimeError(f"can't EMULATE device: {EMULATE.value}")
|
||||
compilers = CompilerSet([CompilerPair(renderer, Compiler), CompilerPair(functools.partial(IR3Renderer, 0x6030001), None, NULL_IR3), # adreno 630
|
||||
CompilerPair(functools.partial(NAKRenderer, "sm_120", 48), None, NULL_NAK)]) # 5090
|
||||
compilers = CompilerSet([(renderer, None), (functools.partial(IR3Renderer, 0x6030001), NULL_IR3), # adreno 630
|
||||
(functools.partial(NAKRenderer, "sm_120", 48), NULL_NAK)]) # 5090
|
||||
super().__init__(device, NullAllocator(self), compilers, functools.partial(NullProgram, device), NullGraph)
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from dataclasses import dataclass
|
|||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import Compiled, BufferSpec, CompilerPair, CompilerSet
|
||||
from tinygrad.device import Compiled, BufferSpec, CompilerSet
|
||||
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, to_mv, hi32, lo32, NV_CC, NV_PTX, NV_NAK, PROFILE
|
||||
from tinygrad.helpers import ContextVar, VIZ, ProfileEvent
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
|
|
@ -619,9 +619,9 @@ class NVDevice(HCQCompiled[NVSignal]):
|
|||
self.arch: str = "sm_120" if self.sm_version==0xa04 else f"sm_{(self.sm_version>>8)&0xff}{(val>>4) if (val:=self.sm_version&0xff) > 0xf else val}"
|
||||
self.sass_version = ((self.sm_version & 0xf00) >> 4) | (self.sm_version & 0xf)
|
||||
|
||||
compilers = CompilerSet(ctrl_var=NV_CC, cset=[CompilerPair(functools.partial(CUDARenderer, self.arch)),
|
||||
CompilerPair(functools.partial(PTXRenderer, self.arch, device="NV"), ctrl_var=NV_PTX),
|
||||
CompilerPair(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), ctrl_var=NV_NAK)])
|
||||
compilers = CompilerSet(ctrl_var=NV_CC, cset=[(functools.partial(CUDARenderer, self.arch), None),
|
||||
(functools.partial(PTXRenderer, self.arch, device="NV"), NV_PTX),
|
||||
(functools.partial(NAKRenderer, self.arch, self.max_warps_per_sm), NV_NAK)])
|
||||
super().__init__(device, NVAllocator(self), compilers, functools.partial(NVProgram, self), NVSignal, NVComputeQueue, NVCopyQueue)
|
||||
|
||||
self.pma_enabled = PMA.value > 0 and PROFILE >= 1
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ from typing import Any, TYPE_CHECKING
|
|||
import pickle, base64, itertools, time, struct, sys, functools
|
||||
from tinygrad.dtype import DType, dtypes, ImageDType, PtrDType, truncate, float_to_fp16, float_to_bf16, float_to_fp8, fp8_to_float
|
||||
from tinygrad.helpers import all_same, getenv, flatten, get_single_element, EMULATE
|
||||
from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet, CompilerPair
|
||||
from tinygrad.device import Compiled, Compiler, Allocator, CompilerSet
|
||||
from tinygrad.codegen.opt import tc
|
||||
from tinygrad.uop.ops import exec_alu, python_alu, Ops, UOp, GroupOp
|
||||
from tinygrad.renderer import Renderer
|
||||
|
|
@ -248,4 +248,4 @@ class PythonAllocator(Allocator['PythonDevice']):
|
|||
|
||||
class PythonDevice(Compiled):
|
||||
def __init__(self, device:str):
|
||||
super().__init__(device, PythonAllocator(self), CompilerSet([CompilerPair(PythonRenderer, None)]), PythonProgram)
|
||||
super().__init__(device, PythonAllocator(self), CompilerSet([(PythonRenderer, None)]), PythonProgram)
|
||||
|
|
|
|||
|
|
@ -2,15 +2,15 @@ from __future__ import annotations
|
|||
import os, ctypes, functools, mmap, struct, array, math, sys, weakref, contextlib
|
||||
assert sys.platform != 'win32'
|
||||
from typing import Any
|
||||
from tinygrad.device import BufferSpec, CompilerSet, CompilerPair, Device
|
||||
from tinygrad.device import BufferSpec, CompilerSet, Device
|
||||
from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
|
||||
from tinygrad.runtime.autogen import kgsl, mesa
|
||||
from tinygrad.runtime.ops_cl import CLCompiler, CLDevice
|
||||
from tinygrad.runtime.ops_cl import CLDevice
|
||||
from tinygrad.renderer.cstyle import QCOMRenderer
|
||||
from tinygrad.renderer.nir import IR3Renderer
|
||||
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, ceildiv, prod, fromimport, cpu_profile, lo32, suppress_finalizing
|
||||
from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE
|
||||
from tinygrad.helpers import next_power2, flatten, QCOM_IR3, QCOM_CC, PROFILE, DEBUG
|
||||
from tinygrad.dtype import ImageDType, dtypes
|
||||
from tinygrad.runtime.support.system import System
|
||||
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
|
|
@ -49,10 +49,6 @@ def pkt7_hdr(opcode: int, cnt: int): return mesa.CP_TYPE7_PKT | cnt & 0x3FFF | p
|
|||
def pkt4_hdr(reg: int, cnt: int): return mesa.CP_TYPE4_PKT | cnt & 0x7F | parity(cnt) << 7 | (reg & 0x3FFFF) << 8 | parity(reg) << 27
|
||||
|
||||
def _read_lib(lib, off) -> int: return struct.unpack("I", lib[off:off+4])[0]
|
||||
class QCOMCompiler(CLCompiler):
|
||||
def __init__(self, device:str=""): super().__init__(CLDevice(device), 'compile_qcom')
|
||||
def disassemble(self, lib:bytes):
|
||||
fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)])
|
||||
|
||||
class QCOMSignal(HCQSignal):
|
||||
def __init__(self, *args, **kwargs): super().__init__(*args, **{**kwargs, 'timestamp_divider': 19.2})
|
||||
|
|
@ -231,7 +227,7 @@ class QCOMArgsState(HCQArgsState):
|
|||
class QCOMProgram(HCQProgram):
|
||||
def __init__(self, dev: QCOMDevice, name: str, lib: bytes, buf_dtypes=[], **kwargs):
|
||||
self.dev: QCOMDevice = dev
|
||||
self.buf_dtypes, self.name, self.lib, self.NIR = buf_dtypes, name, lib, isinstance(dev.renderer, IR3Renderer)
|
||||
self.buf_dtypes, self.name, self.NIR = buf_dtypes, name, isinstance(dev.renderer, IR3Renderer)
|
||||
|
||||
if self.NIR:
|
||||
from tinygrad.runtime.support.compiler_mesa import IR3Compiler
|
||||
|
|
@ -252,7 +248,9 @@ class QCOMProgram(HCQProgram):
|
|||
self.tex_off, self.ibo_off, self.samp_off = 2048, 2048 + 0x40 * self.tex_cnt, 2048 + 0x40 * (self.tex_cnt + self.ibo_cnt)
|
||||
self.fregs, self.hregs = v.info.max_reg + 1, v.info.max_half_reg + 1
|
||||
self.consts_info:list[tuple] = []
|
||||
else: self._parse_lib()
|
||||
else:
|
||||
self._parse_lib(lib:=self.dev.cl_dev.clc.compile_cached(lib.decode()))
|
||||
if DEBUG >= 7: fromimport('tinygrad.runtime.support.compiler_mesa', 'disas_adreno')(lib[(ofs:=_read_lib(lib, 0xc0)):ofs+_read_lib(lib, 0x100)])
|
||||
|
||||
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, buf_spec:=BufferSpec(cpu_access=True, nolru=True))
|
||||
to_mv(self.lib_gpu.va_addr, self.image_size)[:] = self.image
|
||||
|
|
@ -274,21 +272,21 @@ class QCOMProgram(HCQProgram):
|
|||
raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
|
||||
return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)
|
||||
|
||||
def _parse_lib(self):
|
||||
def _parse_lib(self, lib):
|
||||
# Extract image binary
|
||||
self.image_size = _read_lib(self.lib, 0x100)
|
||||
self.image = bytearray(self.lib[(image_offset:=_read_lib(self.lib, 0xc0)):image_offset+self.image_size])
|
||||
self.image_size = _read_lib(lib, 0x100)
|
||||
self.image = bytearray(lib[(image_offset:=_read_lib(lib, 0xc0)):image_offset+self.image_size])
|
||||
|
||||
# Parse image descriptors
|
||||
image_desc_off = _read_lib(self.lib, 0x110)
|
||||
self.prg_offset, self.brnchstck = _read_lib(self.lib, image_desc_off+0xc4), _read_lib(self.lib, image_desc_off+0x108) // 2
|
||||
self.pvtmem, self.shmem = _read_lib(self.lib, image_desc_off+0xc8), _read_lib(self.lib, image_desc_off+0xd8)
|
||||
image_desc_off = _read_lib(lib, 0x110)
|
||||
self.prg_offset, self.brnchstck = _read_lib(lib, image_desc_off+0xc4), _read_lib(lib, image_desc_off+0x108) // 2
|
||||
self.pvtmem, self.shmem = _read_lib(lib, image_desc_off+0xc8), _read_lib(lib, image_desc_off+0xd8)
|
||||
|
||||
# Fill up constants and buffers info
|
||||
self.consts_info = []
|
||||
|
||||
# Collect sampler info.
|
||||
self.samp_cnt = samp_cnt_in_file = _read_lib(self.lib, image_desc_off + 0xdc)
|
||||
self.samp_cnt = samp_cnt_in_file = _read_lib(lib, image_desc_off + 0xdc)
|
||||
assert self.samp_cnt <= 1, "Up to one sampler supported"
|
||||
if self.samp_cnt:
|
||||
self.samp_cnt += 1
|
||||
|
|
@ -298,8 +296,8 @@ class QCOMProgram(HCQProgram):
|
|||
|
||||
# Collect kernel arguments (buffers) info.
|
||||
bdoff, binfos = round_up(image_desc_off + 0x158 + len(self.name), 4) + 8 * samp_cnt_in_file, []
|
||||
while bdoff + 32 <= len(self.lib):
|
||||
length, _, _, offset_words, _, _, _, typ = struct.unpack("8I", self.lib[bdoff:bdoff+32])
|
||||
while bdoff + 32 <= len(lib):
|
||||
length, _, _, offset_words, _, _, _, typ = struct.unpack("8I", lib[bdoff:bdoff+32])
|
||||
if length == 0: break
|
||||
binfos.append((offset_words * 4, typ))
|
||||
bdoff += length
|
||||
|
|
@ -309,16 +307,16 @@ class QCOMProgram(HCQProgram):
|
|||
self.tex_cnt, self.ibo_cnt = sum(typ is BUFTYPE_TEX for _,typ in binfos), sum(typ is BUFTYPE_IBO for _,typ in binfos)
|
||||
self.ibo_off, self.tex_off, self.samp_off = 2048, 2048 + 0x40 * self.ibo_cnt, 2048 + 0x40 * self.tex_cnt + 0x40 * self.ibo_cnt
|
||||
|
||||
if _read_lib(self.lib, 0xb0) != 0: # check if we have constants.
|
||||
cdoff = _read_lib(self.lib, 0xac)
|
||||
if _read_lib(lib, 0xb0) != 0: # check if we have constants.
|
||||
cdoff = _read_lib(lib, 0xac)
|
||||
while cdoff + 40 <= image_offset:
|
||||
cnst, offset_words, _, is32 = struct.unpack("I", self.lib[cdoff:cdoff+4])[0], *struct.unpack("III", self.lib[cdoff+16:cdoff+28])
|
||||
cnst, offset_words, _, is32 = struct.unpack("I", lib[cdoff:cdoff+4])[0], *struct.unpack("III", lib[cdoff+16:cdoff+28])
|
||||
self.consts_info.append((cnst, offset_words * (sz_bytes:=(2 << is32)), sz_bytes))
|
||||
cdoff += 40
|
||||
|
||||
# Registers info
|
||||
reg_desc_off = _read_lib(self.lib, 0x34)
|
||||
self.fregs, self.hregs = _read_lib(self.lib, reg_desc_off + 0x14), _read_lib(self.lib, reg_desc_off + 0x18)
|
||||
reg_desc_off = _read_lib(lib, 0x34)
|
||||
self.fregs, self.hregs = _read_lib(lib, reg_desc_off + 0x14), _read_lib(lib, reg_desc_off + 0x18)
|
||||
|
||||
class QCOMTextureInfo:
|
||||
def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
|
||||
|
|
@ -385,8 +383,8 @@ class QCOMDevice(HCQCompiled):
|
|||
if PROFILE and self.gpu_id[:2] < (7, 3):
|
||||
System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")
|
||||
|
||||
compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[CompilerPair(QCOMRenderer, functools.partial(QCOMCompiler, device)),
|
||||
CompilerPair(functools.partial(IR3Renderer, info.chip_id), None, QCOM_IR3)])
|
||||
self.cl_dev = CLDevice(device)
|
||||
compilers = CompilerSet(ctrl_var=QCOM_CC, cset=[(QCOMRenderer, None), (functools.partial(IR3Renderer, info.chip_id), QCOM_IR3)])
|
||||
super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
|
||||
functools.partial(QCOMComputeQueue, self), None)
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import functools, struct
|
||||
from tinygrad.device import Compiled, Allocator, Compiler, BufferSpec, CompilerSet, CompilerPair
|
||||
from tinygrad.device import Compiled, Allocator, BufferSpec, CompilerSet
|
||||
from tinygrad.renderer.wgsl import WGSLRenderer
|
||||
from tinygrad.helpers import round_up, suppress_finalizing
|
||||
from tinygrad.runtime.autogen import webgpu
|
||||
|
|
@ -217,7 +217,7 @@ class WebGpuDevice(Compiled):
|
|||
self.device_res = _run(webgpu.wgpuAdapterRequestDeviceF, webgpu.WGPURequestDeviceCallbackInfo, webgpu.WGPURequestDeviceCallback,
|
||||
webgpu.WGPURequestDeviceStatus, 1, 2, adapter_res, dev_desc)
|
||||
|
||||
super().__init__(device, WebGpuAllocator(self), CompilerSet([CompilerPair(WGSLRenderer, Compiler)]),
|
||||
super().__init__(device, WebGpuAllocator(self), CompilerSet([(WGSLRenderer, None)]),
|
||||
functools.partial(WebGPUProgram, (self.device_res, webgpu.WGPUFeatureName_TimestampQuery in supported)))
|
||||
|
||||
def synchronize(self):
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue