mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
292 lines
8.8 KiB
Python
292 lines
8.8 KiB
Python
# inspired by https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py
|
|
from inspect import signature
|
|
import numpy as np
|
|
import os
|
|
from collections import defaultdict
|
|
|
|
# **** profiler ****
|
|
|
|
DEBUG = os.getenv("DEBUG", None) is not None
|
|
if DEBUG:
|
|
import atexit, time
|
|
debug_counts, debug_times = defaultdict(int), defaultdict(float)
|
|
def print_debug_exit():
|
|
for name, _ in sorted(debug_times.items(), key=lambda x: -x[1]):
|
|
print(f"{name:>20} : {debug_counts[name]:>6} {debug_times[name]:>10.2f} ms")
|
|
atexit.register(print_debug_exit)
|
|
|
|
class ProfileOp:
|
|
def __init__(self, name, x, backward=False):
|
|
self.name = ("back_" if backward else "")+name
|
|
self.x = x
|
|
def __enter__(self):
|
|
if DEBUG: self.st = time.time()
|
|
def __exit__(self, *junk):
|
|
if DEBUG:
|
|
if cl_queue is not None:
|
|
cl_queue.finish()
|
|
et = (time.time()-self.st)*1000.
|
|
debug_counts[self.name] += 1
|
|
debug_times[self.name] += et
|
|
print(f"{self.name:>20} : {et:>7.2f} ms {[y.shape for y in self.x]}")
|
|
|
|
# **** GPU functions ****
|
|
|
|
cl_ctx, cl_queue = None, None
|
|
def require_init_gpu():
|
|
global cl_ctx, cl_queue
|
|
if cl_queue is None:
|
|
devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.GPU)
|
|
if len(devices) == 0:
|
|
devices = cl.get_platforms()[0].get_devices(device_type=cl.device_type.CPU)
|
|
cl_ctx = cl.Context(devices=devices)
|
|
# this is an in-order command queue
|
|
cl_queue = cl.CommandQueue(cl_ctx)
|
|
|
|
class GPUBuffer:
|
|
def __init__(self, shape, hostbuf=None):
|
|
self.shape, self.dtype = tuple(shape), np.float32
|
|
self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else \
|
|
cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0), 4*np.prod(shape),
|
|
hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
|
|
|
|
def __repr__(self):
|
|
return f"<GPUBuffer with shape {self.shape!r}>"
|
|
|
|
# **** ANE functions ****
|
|
|
|
ane = None
|
|
def require_init_ane():
|
|
global ane
|
|
if ane is None:
|
|
import ane.lib.ane, tinygrad.ops_ane
|
|
ane = ane.lib.ane.ANE()
|
|
|
|
# **** start with two base classes, Tensor and Function ****
|
|
|
|
class Tensor:
|
|
did_float_warning = False
|
|
ops = defaultdict(dict)
|
|
|
|
CPU, GPU, ANE = 0, 1, 2
|
|
|
|
def __init__(self, data, gpu=None, requires_grad=True):
|
|
if "ANETensor" in str(type(data)):
|
|
self.device = Tensor.ANE
|
|
elif isinstance(data, list):
|
|
data = np.array(data, dtype=np.float32)
|
|
elif GPU and isinstance(data, GPUBuffer):
|
|
self.device = Tensor.GPU
|
|
elif not isinstance(data, np.ndarray):
|
|
raise TypeError(f"Error constructing tensor with {data!r}")
|
|
|
|
if isinstance(data, np.ndarray):
|
|
if data.dtype != np.float32 and not Tensor.did_float_warning:
|
|
# warning? float64 is actually needed for numerical jacobian
|
|
print(f"warning, {data.shape!r} isn't float32")
|
|
Tensor.did_float_warning = True
|
|
self.device = Tensor.CPU
|
|
|
|
self.data, self.grad, self.requires_grad = data, None, requires_grad
|
|
|
|
if gpu:
|
|
self.cuda_()
|
|
|
|
# internal variables used for autograd graph construction
|
|
self._ctx = None
|
|
|
|
def __repr__(self):
|
|
return f"Tensor {self.data!r} with grad {(self.grad.data if self.grad else None)!r}"
|
|
|
|
def assign(self, x):
|
|
self.data = x.data
|
|
|
|
@property
|
|
def shape(self):
|
|
return self.data.shape
|
|
|
|
@property
|
|
def dtype(self):
|
|
return self.data.dtype
|
|
|
|
# ***** creation helper functions *****
|
|
|
|
@classmethod
|
|
def zeros(cls, *shape, **kwargs):
|
|
return cls(np.zeros(shape, dtype=np.float32), **kwargs)
|
|
|
|
@classmethod
|
|
def ones(cls, *shape, **kwargs):
|
|
return cls(np.ones(shape, dtype=np.float32), **kwargs)
|
|
|
|
@classmethod
|
|
def randn(cls, *shape, **kwargs):
|
|
return cls(np.random.randn(*shape).astype(np.float32), **kwargs)
|
|
|
|
@classmethod
|
|
def uniform(cls, *shape, **kwargs):
|
|
return cls((np.random.uniform(-1., 1., size=shape)/np.sqrt(np.prod(shape))).astype(np.float32), **kwargs)
|
|
|
|
@classmethod
|
|
def eye(cls, dim, **kwargs):
|
|
return cls(np.eye(dim).astype(np.float32), **kwargs)
|
|
|
|
# ***** toposort and backward pass *****
|
|
|
|
def deepwalk(self, visited: set, nodes: list):
|
|
visited.add(self)
|
|
if self._ctx:
|
|
[i.deepwalk(visited, nodes) for i in self._ctx.parents if i not in visited]
|
|
nodes.append(self)
|
|
return nodes
|
|
|
|
def backward(self):
|
|
assert self.shape == (1,)
|
|
|
|
# fill in the first grad with one
|
|
# this is "implicit gradient creation"
|
|
self.grad = Tensor(np.ones(self.shape, dtype=self.dtype), gpu=self.gpu, requires_grad=False)
|
|
|
|
for t0 in reversed(self.deepwalk(set(), [])):
|
|
assert (t0.grad is not None)
|
|
with ProfileOp(t0._ctx.__class__.__name__, [t0.grad], backward=True):
|
|
grads = t0._ctx.backward(t0._ctx, t0.grad.data)
|
|
if len(t0._ctx.parents) == 1:
|
|
grads = [grads]
|
|
for t,g in zip(t0._ctx.parents, grads):
|
|
if g is not None:
|
|
assert g.shape == t.shape, \
|
|
f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
|
|
gt = Tensor(g, requires_grad=False)
|
|
t.grad = gt if t.grad is None else (t.grad + gt)
|
|
|
|
# ***** tinygrad supports CPU and GPU *****
|
|
|
|
def cpu(self):
|
|
if self.device == Tensor.GPU:
|
|
with ProfileOp("toCPU", [self]):
|
|
ret = Tensor(np.empty(self.shape, dtype=np.float32), gpu=False)
|
|
cl.enqueue_copy(cl_queue, ret.data, self.data.cl, is_blocking=True)
|
|
if self.grad:
|
|
ret.grad = self.grad.cpu()
|
|
return ret
|
|
elif self.device == Tensor.ANE:
|
|
return Tensor(self.data.data().astype(np.float32), gpu=False)
|
|
else:
|
|
return self
|
|
|
|
@property
|
|
def gpu(self):
|
|
return self.device == Tensor.GPU
|
|
|
|
def cuda_(self):
|
|
self.data = self.cuda().data
|
|
self.device = Tensor.GPU
|
|
|
|
def cuda(self):
|
|
if not GPU:
|
|
raise Exception("No GPU Support, install pyopencl")
|
|
if not self.gpu:
|
|
with ProfileOp("toGPU", [self]):
|
|
require_init_gpu()
|
|
ret = Tensor(GPUBuffer(self.shape, self.data))
|
|
if self.grad:
|
|
ret.grad = self.grad.cuda()
|
|
return ret
|
|
return self
|
|
|
|
def ane(self):
|
|
assert(not self.gpu)
|
|
require_init_ane()
|
|
ndata = ane.tensor(self.shape)
|
|
ndata.data()[:] = self.data
|
|
return Tensor(ndata)
|
|
|
|
def detach(self):
|
|
return Tensor(self.data, self.gpu)
|
|
|
|
# ***** non first class ops *****
|
|
|
|
def matmul(self, w):
|
|
return self.dot(w)
|
|
|
|
def mean(self, axis=None):
|
|
out = self.sum(axis=axis)
|
|
coeff = np.prod(out.shape)/np.prod(self.shape)
|
|
return out * coeff
|
|
|
|
def sqrt(self):
|
|
return self.pow(0.5)
|
|
|
|
def div(self, y):
|
|
return self * (y ** -1.0)
|
|
|
|
def swish(self):
|
|
return self * self.sigmoid()
|
|
|
|
def tanh(self):
|
|
return 2.0 * ((2.0 * self).sigmoid()) - 1.0
|
|
|
|
def leakyrelu(self, neg_slope=0.01):
|
|
return self.relu() - (-neg_slope*self).relu()
|
|
|
|
def dropout(self, p=0.5):
|
|
_mask = np.asarray(np.random.binomial(1, 1.0-p, size=self.shape), dtype=self.dtype)
|
|
ret = self * Tensor(_mask, requires_grad=False, gpu=self.gpu)
|
|
return ret.div(1.0 - p)
|
|
|
|
def abs(self):
|
|
return self.relu() + (-1.0*self).relu()
|
|
|
|
# An instantiation of the Function is the Context
|
|
class Function:
|
|
def __init__(self, *tensors):
|
|
self.parents = tensors
|
|
self.saved_tensors = []
|
|
|
|
def save_for_backward(self, *x):
|
|
self.saved_tensors.extend(x)
|
|
|
|
def apply(self, *x, **kwargs):
|
|
ctx = self(*x) # self - operation i.e 'add', 'sub', etc.
|
|
# use default params
|
|
params = signature(self.forward).parameters
|
|
for p in params.values():
|
|
if p.default is not p.empty:
|
|
setattr(ctx, p.name, p.default)
|
|
# overwrite with passed params
|
|
for k, v in kwargs.items():
|
|
setattr(ctx, k, v)
|
|
with ProfileOp(ctx.__class__.__name__, x):
|
|
ret = Tensor(self.forward(ctx, *[t.data for t in x], **kwargs),
|
|
requires_grad=any([t.requires_grad for t in x]))
|
|
if ret.requires_grad:
|
|
ret._ctx = ctx
|
|
return ret
|
|
|
|
def register(name, fxn, device=Tensor.CPU):
|
|
Tensor.ops[device][name] = fxn
|
|
def dispatch(*x, **kwargs):
|
|
tt = [arg for arg in x if isinstance(arg, Tensor)][0]
|
|
x = [Tensor(np.array([arg], dtype=tt.dtype), gpu=tt.gpu, requires_grad=False) if not isinstance(arg, Tensor) else arg for arg in x]
|
|
f = (Tensor.ops[tt.device])[name]
|
|
f.cl_ctx, f.cl_queue, f.ane = cl_ctx, cl_queue, ane
|
|
return f.apply(f, *x, **kwargs)
|
|
setattr(Tensor, name, dispatch)
|
|
# TODO: div is a second class op, so it doesn't work here
|
|
if name in ['add', 'sub', 'mul', 'pow']:
|
|
setattr(Tensor, f"__{name}__", dispatch)
|
|
setattr(Tensor, f"__i{name}__", lambda self,x: self.assign(dispatch(self,x)))
|
|
setattr(Tensor, f"__r{name}__", lambda self,x: dispatch(x,self))
|
|
|
|
# this registers all the operations
|
|
import tinygrad.ops_cpu
|
|
try:
|
|
import pyopencl as cl
|
|
# TODO: move this import to require_init_gpu?
|
|
import tinygrad.ops_gpu
|
|
GPU = True
|
|
except ImportError:
|
|
# no GPU support
|
|
GPU = False
|
|
|