mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
compile raise CompileError and skip only RuntimeError in multiprocess… (#4646)
* compile raise CompileError and skip only RuntimeError in multiprocess beam renderer error with multiprocess should not be skipped by beam * use `==` for dtype to dtype comparison * that needs to be is * typo
This commit is contained in:
parent
8a0d1ca7bb
commit
286b4dbdf2
14 changed files with 30 additions and 24 deletions
|
|
@ -79,7 +79,7 @@ def make_tensor(shape, dtype:dtypes, noncontiguous) -> Tensor:
|
|||
+---------------------------+------------+----------+
|
||||
"""
|
||||
contiguous = not noncontiguous
|
||||
if dtype is dtypes.bool: return Tensor.randint(shape=shape, low=0, high=2, contiguous=contiguous).cast(dtypes.bool)
|
||||
if dtype == dtypes.bool: return Tensor.randint(shape=shape, low=0, high=2, contiguous=contiguous).cast(dtypes.bool)
|
||||
elif dtype.is_unsigned(): return Tensor.randint(shape=shape, low=0, high=10, contiguous=contiguous).cast(dtype)
|
||||
elif dtype.is_int(): return Tensor.randint(shape=shape, low=-9, high=10, contiguous=contiguous).cast(dtype) # signed int
|
||||
elif dtype.is_float(): return Tensor.rand(shape=shape, low=-9, high=9, dtype=dtype, contiguous=contiguous)
|
||||
|
|
@ -452,7 +452,7 @@ class TestIndexing(unittest.TestCase):
|
|||
|
||||
def tensor_indices_to_np(tensor: Tensor, indices):
|
||||
npt = tensor.numpy()
|
||||
idxs = tuple(i.numpy().tolist() if isinstance(i, Tensor) and i.dtype is dtypes.int64 else
|
||||
idxs = tuple(i.numpy().tolist() if isinstance(i, Tensor) and i.dtype == dtypes.int64 else
|
||||
i for i in indices)
|
||||
return npt, idxs
|
||||
|
||||
|
|
|
|||
|
|
@ -113,7 +113,7 @@ class TestDType(unittest.TestCase):
|
|||
arr = np.asarray(data, dtype=dt)
|
||||
tin = Tensor(arr).numpy()
|
||||
tor = torch.as_tensor(arr).detach().numpy()
|
||||
assert dt is tin.dtype is tor.dtype, f"dtype mismatch: expected={dt} | tinygrad={tin.dtype} | torch={tor.dtype}"
|
||||
assert dt == tin.dtype == tor.dtype, f"dtype mismatch: expected={dt} | tinygrad={tin.dtype} | torch={tor.dtype}"
|
||||
np.testing.assert_allclose(tin, tor, atol=1e-6, rtol=1e-3)
|
||||
|
||||
def _test_ops(a_dtype:DType, b_dtype:DType, target_dtype=None):
|
||||
|
|
|
|||
|
|
@ -706,12 +706,12 @@ class TestSchedule(unittest.TestCase):
|
|||
a = shared * 2
|
||||
b = shared * 3
|
||||
sched = check_schedule([a, b], 1)
|
||||
for si in sched[:-2]: assert all(out.dtype is dtypes.half for out in si.outputs)
|
||||
for si in sched[:-2]: assert all(out.dtype == dtypes.half for out in si.outputs)
|
||||
|
||||
# reduce
|
||||
a = z.sum(axis=0).half().float().sum(axis=0)
|
||||
sched = check_schedule(a, 2)
|
||||
for si in sched[:-1]: assert all(out.dtype is dtypes.half for out in si.outputs)
|
||||
for si in sched[:-1]: assert all(out.dtype == dtypes.half for out in si.outputs)
|
||||
|
||||
# expand
|
||||
# expand will realize just after the .float(), so requires change to realize-before-expand
|
||||
|
|
|
|||
|
|
@ -224,7 +224,7 @@ constant_folder = PatternMatcher([
|
|||
({"uop": UOps.ALU, "arg": BinaryOps.CMPLT, "vin": ({"uop": UOps.ALU, "arg": UnaryOps.NEG, "vin": ({"__name__": "x"},)},
|
||||
{"__name__": "c", "uop": UOps.CONST, "dtype": dtypes.int})},
|
||||
lambda c,x: UOp(UOps.ALU, dtypes.bool, (UOp.const(c.dtype, -c.arg), x), BinaryOps.CMPLT)),
|
||||
# cast NOOP (NOTE: it's str to deal with PtrDType)
|
||||
# cast NOOP (NOTE: it's `is` to deal with PtrDType)
|
||||
({"__name__": "root", "uop": UOps.CAST}, lambda root: root.vin[0] if root.dtype is root.vin[0].dtype else None),
|
||||
])
|
||||
|
||||
|
|
|
|||
|
|
@ -164,6 +164,8 @@ MallocAllocator = _MallocAllocator()
|
|||
|
||||
# **************** for Compiled Devices ****************
|
||||
|
||||
class CompileError(Exception): pass
|
||||
|
||||
class Compiler:
|
||||
def __init__(self, cachekey:Optional[str]=None): self.cachekey = None if getenv("DISABLE_COMPILER_CACHE") else cachekey
|
||||
def compile(self, src:str) -> bytes: raise NotImplementedError("need a compile function")
|
||||
|
|
|
|||
|
|
@ -59,7 +59,7 @@ def _try_compile_linearized_w_idx(x:Tuple[int,Linearizer], compiler:Compiler) ->
|
|||
prog = compiler.compile(p.src)
|
||||
et = time.perf_counter() - st
|
||||
return x[0], (p, prog, et)
|
||||
except Exception:
|
||||
except RuntimeError:
|
||||
if DEBUG >= 4: traceback.print_exc()
|
||||
return x[0], None
|
||||
|
||||
|
|
|
|||
|
|
@ -76,7 +76,7 @@ class PTXRenderer(Renderer):
|
|||
def mem_type(self, dtype): return 's8' if dtype.itemsize == 1 else 'b16' if dtype == dtypes.float16 else self.types[dtype]
|
||||
|
||||
def render_load(self, loc, dest, dtype, gate=None, alt=None, ss="", offset=0) -> List[str]:
|
||||
assert dtype is not dtypes.bool
|
||||
assert dtype != dtypes.bool
|
||||
if gate: return [f"@{gate} ld{ss}.{self.mem_type(dtype)} {dest}, [{loc}+{offset}];", f"@!{gate} mov.b{self.types[dtype][1:]} {dest}, {alt};"]
|
||||
return [f"ld{ss}.{self.mem_type(dtype)} {dest}, [{loc}+{offset}];"]
|
||||
|
||||
|
|
@ -154,7 +154,7 @@ class PTXRenderer(Renderer):
|
|||
kk(f"{r_label[vin[0]]}:")
|
||||
elif uop is UOps.STORE:
|
||||
assert vin[0].dtype is not None and vin[2].dtype is not None
|
||||
assert vin[0].dtype is dtypes.int64, "store isn't int64"
|
||||
assert vin[0].dtype == dtypes.int64, "store isn't int64"
|
||||
assert vin[1].uop is UOps.CONST, f"store isn't const {u}"
|
||||
mem_type = '.shared' if vin[0].uop is UOps.DEFINE_LOCAL or any(x.uop is UOps.DEFINE_LOCAL for x in vin[0].parents) else '.global'
|
||||
if vin[2].dtype.count > 1:
|
||||
|
|
@ -187,7 +187,7 @@ class PTXRenderer(Renderer):
|
|||
else: r[u] = const(args, dtype, mov=True)
|
||||
elif uop is UOps.GEP: r[u] = r[vin[0]][u.arg]
|
||||
elif uop is UOps.LOAD:
|
||||
assert vin[0].dtype is dtypes.int64, "load isn't int64"
|
||||
assert vin[0].dtype == dtypes.int64, "load isn't int64"
|
||||
assert vin[1].uop is UOps.CONST, f"load isn't const {u}"
|
||||
mem_type = '.shared' if vin[0].uop is UOps.DEFINE_LOCAL or any(x.uop is UOps.DEFINE_LOCAL for x in vin[0].parents) else '.global'
|
||||
if dtype.count > 1:
|
||||
|
|
|
|||
|
|
@ -24,7 +24,7 @@ class CStyleLanguage(Renderer):
|
|||
uses_ptr_arithmetic: bool = False
|
||||
type_map: Dict[DType, str] = {}
|
||||
code_for_op: Dict = {
|
||||
UnaryOps.NEG: lambda x,dtype: f"(!{x})" if dtype is dtypes.bool else f"(-{x})", UnaryOps.SQRT: lambda x,dtype: f"sqrt({x})",
|
||||
UnaryOps.NEG: lambda x,dtype: f"(!{x})" if dtype == dtypes.bool else f"(-{x})", UnaryOps.SQRT: lambda x,dtype: f"sqrt({x})",
|
||||
UnaryOps.EXP2: lambda x,dtype: f"exp2({x})", UnaryOps.LOG2: lambda x,dtype: f"log2({x})", UnaryOps.SIN: lambda x,dtype: f"sin({x})",
|
||||
BinaryOps.ADD: lambda a,b,dtype: f"({a}+{b})", BinaryOps.SUB: lambda a,b,dtype: f"({a}-{b})", BinaryOps.MUL: lambda a,b,dtype: f"({a}*{b})",
|
||||
BinaryOps.DIV: lambda a,b,dtype: f"({a}/{b})", BinaryOps.MAX: lambda a,b,dtype: f"max({a},{b})", BinaryOps.MOD: lambda a,b,dtype: f"({a}%{b})",
|
||||
|
|
@ -43,8 +43,8 @@ class CStyleLanguage(Renderer):
|
|||
def render_const(self, x:ConstType, dtype:DType) -> str:
|
||||
if math.isnan(x): val = "NAN"
|
||||
elif math.isinf(x): val = ("-" if x < 0 else "") + "INFINITY"
|
||||
elif dtype is dtypes.bool: val = "1" if x else "0"
|
||||
elif dtype is dtypes.float: val = f"{x}f"
|
||||
elif dtype == dtypes.bool: val = "1" if x else "0"
|
||||
elif dtype == dtypes.float: val = f"{x}f"
|
||||
else: val = str(x)
|
||||
return (self.render_cast([val] * dtype.count, dtype) if dtype.count > 1 or dtype not in [dtypes.float, dtypes.int, dtypes.bool] else val)
|
||||
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ def is_bool_or_unsigned(dtype: DType): return dtype == dtypes.bool or dtypes.is_
|
|||
|
||||
code_for_op: Final[Dict[Op, Callable]] = {
|
||||
UnaryOps.NEG: lambda builder, x, dtype: builder.neg(x) if dtypes.is_int(dtype) else \
|
||||
(builder.not_(x) if dtype is dtypes.bool else builder.fneg(x, flags=MFLAGS)),
|
||||
(builder.not_(x) if dtype == dtypes.bool else builder.fneg(x, flags=MFLAGS)),
|
||||
UnaryOps.EXP2: lambda builder, x, dtype: builder.call(builder.module.declare_intrinsic('llvm.exp2', [x.type]), [x], fastmath=MFLAGS),
|
||||
UnaryOps.LOG2: lambda builder, x, dtype: builder.call(builder.module.declare_intrinsic('llvm.log2', [x.type]), [x], fastmath=MFLAGS),
|
||||
UnaryOps.SIN: lambda builder, x, dtype: builder.call(builder.module.declare_intrinsic('llvm.sin', [x.type]), [x], fastmath=MFLAGS),
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from __future__ import annotations
|
||||
from typing import Tuple, List, Any, cast
|
||||
import os, fcntl, ctypes, ctypes.util, functools, re, pathlib, mmap, struct, errno, subprocess, time
|
||||
from tinygrad.device import Compiled, Compiler, BufferOptions, LRUAllocator
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator
|
||||
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, DEBUG
|
||||
from tinygrad.renderer.cstyle import AMDRenderer
|
||||
from tinygrad.runtime.driver.hip_comgr import compile_hip
|
||||
|
|
@ -77,7 +77,9 @@ class AMDCompiler(Compiler):
|
|||
def __init__(self, arch:str):
|
||||
self.arch = arch
|
||||
super().__init__(f"compile_hip_{self.arch}")
|
||||
def compile(self, src:str) -> bytes: return compile_hip(src, self.arch)
|
||||
def compile(self, src:str) -> bytes:
|
||||
try: return compile_hip(src, self.arch)
|
||||
except RuntimeError as e: raise CompileError(e)
|
||||
|
||||
PAGE_SIZE = 0x1000
|
||||
SIGNAL_SIZE, SIGNAL_COUNT = ctypes.sizeof(hsa.amd_signal_t), 16384
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ from pathlib import Path
|
|||
from typing import Tuple, Optional, List
|
||||
import tinygrad.runtime.autogen.cuda as cuda
|
||||
from tinygrad.helpers import DEBUG, getenv, from_mv, to_char_p_p, init_c_var, init_c_struct_t, colored, cpu_time_execution
|
||||
from tinygrad.device import Compiled, Compiler, BufferOptions, LRUAllocator, MallocAllocator
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator, MallocAllocator
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
from tinygrad.renderer.assembly import PTXRenderer
|
||||
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401
|
||||
|
|
@ -71,7 +71,7 @@ class CUDACompiler(Compiler):
|
|||
check(cuda.nvrtcCreateProgram(ctypes.byref(prog := cuda.nvrtcProgram()), src.encode(), "<null>".encode(), 0, None, None))
|
||||
status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
|
||||
|
||||
if status != 0: raise RuntimeError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
|
||||
if status != 0: raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, check).decode()}")
|
||||
return _get_bytes(prog, cuda.nvrtcGetPTX, cuda.nvrtcGetPTXSize, check)
|
||||
|
||||
def cuda_disassemble(lib, arch):
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ import ctypes, functools, hashlib
|
|||
import tinygrad.runtime.autogen.opencl as cl
|
||||
from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG
|
||||
from tinygrad.renderer.cstyle import OpenCLRenderer
|
||||
from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler
|
||||
from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
|
||||
|
||||
# see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
|
||||
OSX_TIMING_RATIO = (125/3) if OSX else 1.0
|
||||
|
|
@ -23,7 +23,7 @@ class CLCompiler(Compiler):
|
|||
if build_status != 0:
|
||||
cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, log_size := ctypes.c_size_t())
|
||||
cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
|
||||
raise RuntimeError(f"OpenCL Compile Error\n\n{mstr.value.decode()}")
|
||||
raise CompileError(f"OpenCL Compile Error\n\n{mstr.value.decode()}")
|
||||
check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(ctypes.c_size_t), binary_sizes := (ctypes.c_size_t * 1)(), None))
|
||||
check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), (ctypes.c_void_p * 1)(ctypes.addressof(binary := ctypes.create_string_buffer(binary_sizes[0]))), None)) # noqa: E501
|
||||
check(cl.clReleaseProgram(program))
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ import ctypes, functools, subprocess, io, atexit, collections, json
|
|||
from typing import Tuple, TypeVar, List, Dict, Any
|
||||
import tinygrad.runtime.autogen.hsa as hsa
|
||||
from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
|
||||
from tinygrad.device import Compiled, Compiler, BufferOptions, LRUAllocator
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator
|
||||
from tinygrad.renderer.cstyle import HIPRenderer
|
||||
from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
|
||||
from tinygrad.runtime.driver.hip_comgr import compile_hip
|
||||
|
|
@ -45,7 +45,9 @@ class HSACompiler(Compiler):
|
|||
def __init__(self, arch:str):
|
||||
self.arch = arch
|
||||
super().__init__(f"compile_hip_{self.arch}")
|
||||
def compile(self, src:str) -> bytes: return compile_hip(src, self.arch)
|
||||
def compile(self, src:str) -> bytes:
|
||||
try: return compile_hip(src, self.arch)
|
||||
except RuntimeError as e: raise CompileError(e)
|
||||
|
||||
class HSAProgram:
|
||||
def __init__(self, device:HSADevice, name:str, lib:bytes):
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from __future__ import annotations
|
||||
import os, ctypes, pathlib, re, fcntl, functools, mmap, struct, tempfile, hashlib, subprocess, time, array
|
||||
from typing import Tuple, List, Any, cast
|
||||
from tinygrad.device import Compiled, Compiler, LRUAllocator, BufferOptions
|
||||
from tinygrad.device import Compiled, Compiler, CompileError, LRUAllocator, BufferOptions
|
||||
from tinygrad.helpers import getenv, from_mv, init_c_struct_t, to_mv, round_up, to_char_p_p, DEBUG, prod
|
||||
from tinygrad.renderer.cstyle import NVRenderer
|
||||
from tinygrad.runtime.ops_cuda import check as cuda_check, _get_bytes, CUDACompiler
|
||||
|
|
@ -80,7 +80,7 @@ class NVCompiler(Compiler):
|
|||
status = cuda.nvrtcCompileProgram(prog, len(self.compile_options), to_char_p_p([o.encode() for o in self.compile_options]))
|
||||
|
||||
if status != 0:
|
||||
raise RuntimeError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
|
||||
raise CompileError(f"compile failed: {_get_bytes(prog, cuda.nvrtcGetProgramLog, cuda.nvrtcGetProgramLogSize, cuda_check).decode()}")
|
||||
return _get_bytes(prog, cuda.nvrtcGetCUBIN, cuda.nvrtcGetCUBINSize, cuda_check)
|
||||
|
||||
class HWComputeQueue:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue