tinygrad/tinygrad/tensor.py
2020-12-07 20:03:28 -08:00

267 lines
7.9 KiB
Python

# inspired by https://github.com/karpathy/micrograd/blob/master/micrograd/engine.py
from inspect import signature
import numpy as np
import os
# **** profiler ****
DEBUG = os.getenv("DEBUG", None) is not None
if DEBUG:
import collections, atexit, time
debug_counts = collections.defaultdict(int)
debug_times = collections.defaultdict(float)
def print_debug_exit():
for name, _ in sorted(debug_times.items(), key=lambda x: -x[1]):
print(f"{name:>20} : {debug_counts[name]:>6} {debug_times[name]:>10.2f} ms")
atexit.register(print_debug_exit)
class ProfileOp:
def __init__(self, name, x, backward=False):
self.name = ("back_" if backward else "")+name
self.x = x
def __enter__(self):
if DEBUG: self.st = time.time()
def __exit__(self, *junk):
if DEBUG:
if cl_queue is not None:
cl_queue.finish()
et = (time.time()-self.st)*1000.
debug_counts[self.name] += 1
debug_times[self.name] += et
print(f"{self.name:>20} : {et:>7.2f} ms {[y.shape for y in self.x]}")
# **** GPU functions ****
cl_ctx, cl_queue = None, None
def require_init_gpu():
global cl_ctx, cl_queue
if cl_queue is None:
cl_ctx = cl.create_some_context(interactive=False)
# this is an in-order command queue
cl_queue = cl.CommandQueue(cl_ctx)
class GPUBuffer:
def __init__(self, shape, hostbuf=None):
self.shape, self.dtype = tuple(shape), np.float32
self.cl = hostbuf.cl if isinstance(hostbuf, GPUBuffer) else \
cl.Buffer(cl_ctx, cl.mem_flags.READ_WRITE | (cl.mem_flags.COPY_HOST_PTR if hostbuf is not None else 0), 4*np.prod(shape),
hostbuf=hostbuf.astype(np.float32).ravel() if hostbuf is not None else None)
def __repr__(self):
return f"<GPUBuffer with shape {self.shape!r}>"
# **** start with two base classes, Tensor and Function ****
class Tensor:
did_float_warning = False
default_gpu = False
allocated = 0
ops, opsgpu = {}, {}
def __init__(self, data, gpu=None, requires_grad=True):
if gpu is None:
gpu = Tensor.default_gpu
if isinstance(data, list):
data = np.array(data, dtype=np.float32)
elif GPU and isinstance(data, GPUBuffer):
self.gpu = True
elif not isinstance(data, np.ndarray):
raise TypeError(f"Error constructing tensor with {data!r}")
if isinstance(data, np.ndarray):
if data.dtype != np.float32 and not Tensor.did_float_warning:
# warning? float64 is actually needed for numerical jacobian
print(f"warning, {data.shape!r} isn't float32")
Tensor.did_float_warning = True
self.gpu = False
self.data = data
self.grad = None
self.requires_grad = requires_grad
if gpu:
self.cuda_()
# internal variables used for autograd graph construction
self._ctx = None
Tensor.allocated += 1
def __del__(self):
#print("cleanup", self.shape)
Tensor.allocated -= 1
def __repr__(self):
return f"Tensor {self.data!r} with grad {(self.grad.data if self.grad else None)!r}"
def assign(self, x):
self.data = x.data
@property
def shape(self):
return self.data.shape
@property
def dtype(self):
return self.data.dtype
# ***** creation helper functions *****
@classmethod
def zeros(cls, *shape, **kwargs):
return cls(np.zeros(shape, dtype=np.float32), **kwargs)
@classmethod
def ones(cls, *shape, **kwargs):
return cls(np.ones(shape, dtype=np.float32), **kwargs)
@classmethod
def randn(cls, *shape, **kwargs):
return cls(np.random.randn(*shape).astype(np.float32), **kwargs)
@classmethod
def uniform(cls, *shape, **kwargs):
return cls((np.random.uniform(-1., 1., size=shape)/np.sqrt(np.prod(shape))).astype(np.float32), **kwargs)
@classmethod
def eye(cls, dim, **kwargs):
return cls(np.eye(dim).astype(np.float32), **kwargs)
# ***** toposort and backward pass *****
def deepwalk(self, visited=None, nodes=None):
if visited == None and nodes == None:
visited, nodes = set(), []
visited.add(self)
if self._ctx:
[i.deepwalk(visited, nodes) for i in self._ctx.parents if i not in visited]
nodes.append(self)
return nodes
def backward(self):
if self._ctx is None:
return
# fill in the first grad with one
# this is "implicit gradient creation"
assert self.shape == (1,)
self.grad = Tensor(np.ones(self.shape, dtype=self.dtype), gpu=self.gpu, requires_grad=False)
for t0 in reversed(self.deepwalk()):
assert (t0.grad is not None)
with ProfileOp(t0._ctx.__class__.__name__, [t0.grad], backward=True):
grads = t0._ctx.backward(t0._ctx, t0.grad.data)
if len(t0._ctx.parents) == 1:
grads = [grads]
for t,g in zip(t0._ctx.parents, grads):
if g is None:
continue
assert g.shape == t.shape, \
f"grad shape must match tensor shape in {self._ctx!r}, {g.shape!r} != {t.shape!r}"
gt = Tensor(g, requires_grad=False)
t.grad = gt if t.grad is None else (t.grad + gt)
# ***** tinygrad supports CPU and GPU *****
def cpu(self):
if self.gpu:
ret = Tensor(np.empty(self.shape, dtype=np.float32), gpu=False)
cl.enqueue_copy(cl_queue, ret.data, self.data.cl, is_blocking=True)
if self.grad:
ret.grad = self.grad.cpu()
return ret
else:
return self
def cuda_(self):
self.data = self.cuda().data
self.gpu = True
def cuda(self):
if not GPU:
raise Exception("No GPU Support, install pyopencl")
if not self.gpu:
require_init_gpu()
ret = Tensor(GPUBuffer(self.shape, self.data))
if self.grad:
ret.grad = self.grad.cuda()
return ret
else:
return self
def detach(self):
return Tensor(self.data, self.gpu)
# ***** non first class ops *****
def mean(self):
div = Tensor(np.array([1/np.prod(self.shape)], dtype=self.dtype), gpu=self.gpu, requires_grad=False)
return self.sum().mul(div)
def sqrt(self):
root = Tensor(np.zeros(self.shape, dtype=self.dtype)+0.5, gpu=self.gpu, requires_grad=False)
return self.pow(root)
def div(self, y):
root = Tensor(np.zeros(self.shape, dtype=self.dtype)-1, gpu=self.gpu, requires_grad=False)
return self.mul(y.pow(root))
def swish(self):
return self.mul(self.sigmoid())
def tanh(self):
t2 = Tensor(np.zeros(self.shape, dtype=self.dtype)+2, gpu=self.gpu, requires_grad=False)
t1 = Tensor(np.zeros(self.shape, dtype=self.dtype)+1, gpu=self.gpu, requires_grad=False)
return self.mul(t2).sigmoid().mul(t2) - t1 # 2*sigmoid(2*x)-1
# An instantiation of the Function is the Context
class Function:
def __init__(self, *tensors):
self.parents = tensors
self.saved_tensors = []
def save_for_backward(self, *x):
self.saved_tensors.extend(x)
def apply(self, *x, **kwargs):
op = self
ctx = op(*x)
# use default params
params = signature(op.forward).parameters
for p in params.values():
if p.default is not p.empty:
setattr(ctx, p.name, p.default)
# overwrite with passed params
for k, v in kwargs.items():
setattr(ctx, k, v)
with ProfileOp(ctx.__class__.__name__, x):
ret = Tensor(op.forward(ctx, *[t.data for t in x], **kwargs),
requires_grad=any([t.requires_grad for t in x]))
if ret.requires_grad:
ret._ctx = ctx
return ret
def register(name, fxn, gpu=False):
if gpu:
Tensor.opsgpu[name] = fxn
else:
Tensor.ops[name] = fxn
def dispatch(*x, **kwargs):
f = (Tensor.opsgpu if x[0].gpu else Tensor.ops)[name]
f.cl_ctx, f.cl_queue = cl_ctx, cl_queue
return f.apply(f, *x, **kwargs)
setattr(Tensor, name, dispatch)
if name in ['add', 'sub', 'mul', 'div', 'pow']:
setattr(Tensor, f"__{name}__", dispatch)
setattr(Tensor, f"__i{name}__", lambda self,x: self.assign(dispatch(self,x)))
# this registers all the operations
import tinygrad.ops
try:
import pyopencl as cl
import tinygrad.opsgpu
GPU = True
except ImportError:
# no GPU support
GPU = False