tinygrad/tinygrad/runtime/ops_gpu.py
George Hotz 2c363b5f0b
new style device (#2530)
* cpu tests pass

* torch works

* works

* metal works

* fix ops_disk

* metal jit works

* fix openpilot

* llvm and clang work

* fix webgpu

* docs are rly broken

* LRU works on metal

* delete comment

* revert name to ._buf. LRU only on Compiled

* changes

* allocator

* allocator, getting closer

* lru alloc

* LRUAllocator

* all pass

* metal

* cuda

* test examples

* linearizer

* test fixes

* fix custom + clean realize

* fix hip

* skip tests

* fix tests

* fix size=0

* fix MOCKHIP

* fix thneed

* copy better

* simple

* old style metal copy

* fix thneed

* np reshape

* give cuda a device
2023-11-30 17:07:16 -08:00

90 lines
No EOL
4.7 KiB
Python

from __future__ import annotations
import os
os.environ['PYOPENCL_NO_CACHE'] = '1'
import pathlib, functools
import numpy as np
import pyopencl as cl
from typing import Optional, List, Tuple
from tinygrad.helpers import DEBUG, getenv, prod, ImageDType, OSX, fromimport, diskcache, DType
from tinygrad.device import Compiled, LRUAllocator
from tinygrad.renderer.opencl import OpenCLRenderer
from tinygrad.codegen.kernel import LinearizerOptions
OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
# TODO: if you fork and exit the child process after creating anything with cl on AMD, it hangs on e.wait()
ROCM_LLVM_PATH = pathlib.Path("/opt/rocm/llvm/bin")
if DEBUG >= 6:
early_exec = fromimport("extra.helpers", "enable_early_exec")()
@diskcache
def compile_gpu(prg:str) -> bytes:
clprg = cl.Program(GPUDevice.compile_context, prg)
clprg.build()
return clprg.get_info(cl.program_info.BINARIES)[0]
class CLProgram:
def __init__(self, device:GPUDevice, name:str, prg:bytes, bufs:int=0, vars:int=0):
self.device, self.name, self.clprogram = device, name, cl.Program(device.ctx, [device.ctx.devices[0]], [prg])
self.clprogram.build()
self.clprg = self.clprogram.__getattr__(name)
if DEBUG >= 5 and not OSX:
device_name = self.device.ctx.devices[0].name
if 'Adreno' in device_name:
fromimport('disassemblers.adreno', 'disasm')(prg)
elif device_name.startswith('gfx'):
asm = early_exec(([ROCM_LLVM_PATH / "llvm-objdump", '-d', '-'], prg))
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
elif "NVIDIA" in device_name:
# print the PTX for NVIDIA.
print(prg.decode('utf-8'))
if vars > 0: self.clprg.set_scalar_arg_dtypes([None]*bufs + [np.int32]*vars)
@staticmethod
def max_work_group_size(): return GPUDevice.compile_context.devices[0].max_work_group_size if GPUDevice.compile_context is not None else 1024
def __call__(self, *bufs, global_size:Tuple[int,int,int], local_size:Optional[Tuple[int,int,int]]=None, wait=False) -> Optional[float]:
e = self.clprg(self.device.queue, [int(g*l) for g,l in zip(global_size, local_size)] if local_size is not None else global_size, local_size, *bufs)
if wait:
e.wait()
try:
return ((e.profile.end - e.profile.start) * OSX_TIMING_RATIO) * 1e-9
except cl.RuntimeError: # no profiling info available
return None
return None
class CLAllocator(LRUAllocator):
def __init__(self, device:GPUDevice):
self.events: List[cl.Event] = []
self.device = device
super().__init__()
def _alloc(self, size:int, dtype:DType):
if size == 0: return None
if isinstance(dtype, ImageDType):
# NOTE: the memory is a bit off here due to padding, it's buf.row_pitch * buf.height * 4 * dtype.itemsize
assert size == prod(dtype.shape), f"image size mismatch {size} != {dtype.shape}"
fmt = cl.ImageFormat(cl.channel_order.RGBA, {2: cl.channel_type.HALF_FLOAT, 4: cl.channel_type.FLOAT}[dtype.itemsize])
buf = cl.Image(self.device.ctx, cl.mem_flags.READ_WRITE, fmt, shape=(dtype.shape[1], dtype.shape[0]))
else:
buf = cl.Buffer(self.device.ctx, cl.mem_flags.READ_WRITE, size * dtype.itemsize)
return buf
def copyin(self, dest:cl.Buffer, src:memoryview): self.events.append(cl.enqueue_copy(self.device.queue, dest, src, is_blocking=False))
def copyout(self, dest:memoryview, src:cl.Buffer):
self.events.clear()
cl.enqueue_copy(self.device.queue, dest, src, is_blocking=True)
class GPUDevice(Compiled):
devices = None
compile_context = None
def __init__(self, device:str):
if GPUDevice.devices is None:
cl_platforms = cl.get_platforms()
platform_devices: List[List[cl.Device]] = [y for y in ([x.get_devices(device_type=cl.device_type.GPU) for x in cl_platforms] + [x.get_devices(device_type=cl.device_type.CPU) for x in cl_platforms]) if y]
GPUDevice.devices = [device for device in platform_devices[getenv('CL_PLATFORM', 0)] if device.name not in getenv('CL_EXCLUDE', "").split(",")]
if DEBUG >= 1: print(f"using devices: {[device.hashable_model_and_version_identifier for device in GPUDevice.devices]}")
self.device = int(device.split(":")[1]) if ":" in device else 0
self.ctx = cl.Context(devices=[GPUDevice.devices[self.device]])
if GPUDevice.compile_context is None: GPUDevice.compile_context = self.ctx
self.queue = cl.CommandQueue(self.ctx, device=self.ctx.devices[0], properties=cl.command_queue_properties.PROFILING_ENABLE)
super().__init__(CLAllocator(self), LinearizerOptions(), OpenCLRenderer, compile_gpu, functools.partial(CLProgram, self))
def synchronize(self): self.queue.finish()