os x profiling: this ratio is exact i believe

This commit is contained in:
George Hotz 2023-01-28 19:02:51 -08:00
commit b0df4d99a0
3 changed files with 52 additions and 6 deletions

View file

@ -9,6 +9,7 @@ from tinygrad.llops.ops_gpu import CL, CLProgram
from tinygrad.helpers import prod
from collections import defaultdict
import pyopencl as cl
from tinygrad.runtime.opencl import OSX_TIMING_RATIO
DEBUGCL = int(os.getenv("DEBUGCL", 0))
FLOAT16 = int(os.getenv("FLOAT16", 0))
@ -279,7 +280,7 @@ class Thneed:
if DEBUGCL >= 1:
total_runtime = 0
for i, ((prg, args), e) in enumerate(zip(self.cl_cache, events)):
runtime = (e.profile.end - e.profile.start)
runtime = (e.profile.end - e.profile.start) * OSX_TIMING_RATIO
print(f"{i:3d} time {total_runtime/1e6:5.2f} ms running {prg.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d} runtime {runtime/1e3:7.2f} us {(prg.op_estimate)/runtime:9.2f} GFLOPS {prg.options} -> {args[2].shape if hasattr(args[2], 'shape') else args[2].size}")
if (DEBUGCL >= 2 and int(os.getenv("PRINT_KERNEL", "-1")) == i) or DEBUGCL >= 3:
print(prg.prg)

View file

@ -0,0 +1,45 @@
from tinygrad.runtime.opencl import CLProgram, CL, CLBuffer
import time
N = 1000000
a = CLBuffer(N*4)
b = CLBuffer(N*4)
c = CLBuffer(N*4)
prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
int idx = get_global_id(0);
a[idx] = b[idx] + c[idx];
}""")
prg.clprg(CL().cl_queue, [N,], None, a.cl, b.cl, c.cl)
t1 = time.monotonic_ns()
e1 = prg.clprg(CL().cl_queue, [N,], None, a.cl, b.cl, c.cl)
CL.cl_queue.finish()
t2 = time.monotonic_ns()
time.sleep(3)
t3 = time.monotonic_ns()
e2 = prg.clprg(CL().cl_queue, [N,], None, a.cl, b.cl, c.cl)
CL.cl_queue.finish()
t4 = time.monotonic_ns()
print(e1.profile.queued)
print(e1.profile.submit)
print(e1.profile.start)
print(e1.profile.end)
print(e1, e2)
print(t2-t1, e1.profile.end - e1.profile.start)
print(t4-t3, e2.profile.end - e2.profile.start)
print(t3-t2, e2.profile.queued-e1.profile.end)
print((t3-t2) / (e2.profile.start-e1.profile.end), "ratio")
print("ratio since boot", t1/e1.profile.start)
print(e1.profile.start)
print(e1.profile.end)
print(e2.profile.start)
print(e2.profile.end)
import pdb
pdb.set_trace()

View file

@ -1,4 +1,4 @@
import os, functools, time, platform
import os, functools, platform
import numpy as np
import pyopencl as cl # type: ignore
from typing import Dict, Optional, Tuple, List
@ -6,6 +6,8 @@ from collections import defaultdict
from tinygrad.ops import DEBUG
OSX = platform.system() == "Darwin"
OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
CLCACHE = int(os.getenv("CLCACHE", "1"))
FLOAT16 = int(os.getenv("FLOAT16", "0"))
@ -73,14 +75,12 @@ class CLProgram:
def __call__(self, *args):
CL.kernel_count += 1
if DEBUG >= 4: print(args[0], args[1], self.prg)
if OSX and DEBUG >= 2: st = time.monotonic_ns()
if CL.CACHE is not None: CL.CACHE.append((self, args))
else: e = self.clprg(CL().cl_queue, *args)
if DEBUG >= 2:
CL.cl_queue.finish()
# NOTE: Profiling is (sadly) broken in OS X, so we take the real kernel time
# BOUNTY: will paypal $50 to anyone who fixes this
et = (time.monotonic_ns() - st) if OSX else (e.profile.end - e.profile.start)
# NOTE: Profiling is not in ns in OS X, we multiply by a computed ratio
et = (e.profile.end - e.profile.start) * OSX_TIMING_RATIO
if DEBUG >= 1:
CL.time_sum += 0 if DEBUG <= 1 or CL.CACHE is not None else et
CL.ops_sum += self.op_estimate