mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
os x profiling: this ratio is exact i believe
This commit is contained in:
parent
c0963b723e
commit
b0df4d99a0
3 changed files with 52 additions and 6 deletions
|
|
@ -9,6 +9,7 @@ from tinygrad.llops.ops_gpu import CL, CLProgram
|
|||
from tinygrad.helpers import prod
|
||||
from collections import defaultdict
|
||||
import pyopencl as cl
|
||||
from tinygrad.runtime.opencl import OSX_TIMING_RATIO
|
||||
|
||||
DEBUGCL = int(os.getenv("DEBUGCL", 0))
|
||||
FLOAT16 = int(os.getenv("FLOAT16", 0))
|
||||
|
|
@ -279,7 +280,7 @@ class Thneed:
|
|||
if DEBUGCL >= 1:
|
||||
total_runtime = 0
|
||||
for i, ((prg, args), e) in enumerate(zip(self.cl_cache, events)):
|
||||
runtime = (e.profile.end - e.profile.start)
|
||||
runtime = (e.profile.end - e.profile.start) * OSX_TIMING_RATIO
|
||||
print(f"{i:3d} time {total_runtime/1e6:5.2f} ms running {prg.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d} runtime {runtime/1e3:7.2f} us {(prg.op_estimate)/runtime:9.2f} GFLOPS {prg.options} -> {args[2].shape if hasattr(args[2], 'shape') else args[2].size}")
|
||||
if (DEBUGCL >= 2 and int(os.getenv("PRINT_KERNEL", "-1")) == i) or DEBUGCL >= 3:
|
||||
print(prg.prg)
|
||||
|
|
|
|||
45
test/external_osx_profiling.py
Normal file
45
test/external_osx_profiling.py
Normal file
|
|
@ -0,0 +1,45 @@
|
|||
from tinygrad.runtime.opencl import CLProgram, CL, CLBuffer
|
||||
import time
|
||||
|
||||
|
||||
N = 1000000
|
||||
a = CLBuffer(N*4)
|
||||
b = CLBuffer(N*4)
|
||||
c = CLBuffer(N*4)
|
||||
|
||||
prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
|
||||
int idx = get_global_id(0);
|
||||
a[idx] = b[idx] + c[idx];
|
||||
}""")
|
||||
prg.clprg(CL().cl_queue, [N,], None, a.cl, b.cl, c.cl)
|
||||
|
||||
t1 = time.monotonic_ns()
|
||||
e1 = prg.clprg(CL().cl_queue, [N,], None, a.cl, b.cl, c.cl)
|
||||
CL.cl_queue.finish()
|
||||
t2 = time.monotonic_ns()
|
||||
time.sleep(3)
|
||||
t3 = time.monotonic_ns()
|
||||
e2 = prg.clprg(CL().cl_queue, [N,], None, a.cl, b.cl, c.cl)
|
||||
CL.cl_queue.finish()
|
||||
t4 = time.monotonic_ns()
|
||||
|
||||
print(e1.profile.queued)
|
||||
print(e1.profile.submit)
|
||||
print(e1.profile.start)
|
||||
print(e1.profile.end)
|
||||
|
||||
print(e1, e2)
|
||||
print(t2-t1, e1.profile.end - e1.profile.start)
|
||||
print(t4-t3, e2.profile.end - e2.profile.start)
|
||||
print(t3-t2, e2.profile.queued-e1.profile.end)
|
||||
print((t3-t2) / (e2.profile.start-e1.profile.end), "ratio")
|
||||
|
||||
print("ratio since boot", t1/e1.profile.start)
|
||||
|
||||
print(e1.profile.start)
|
||||
print(e1.profile.end)
|
||||
print(e2.profile.start)
|
||||
print(e2.profile.end)
|
||||
|
||||
import pdb
|
||||
pdb.set_trace()
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
import os, functools, time, platform
|
||||
import os, functools, platform
|
||||
import numpy as np
|
||||
import pyopencl as cl # type: ignore
|
||||
from typing import Dict, Optional, Tuple, List
|
||||
|
|
@ -6,6 +6,8 @@ from collections import defaultdict
|
|||
from tinygrad.ops import DEBUG
|
||||
|
||||
OSX = platform.system() == "Darwin"
|
||||
OSX_TIMING_RATIO = (125/3) if OSX else 1.0 # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
|
||||
|
||||
CLCACHE = int(os.getenv("CLCACHE", "1"))
|
||||
FLOAT16 = int(os.getenv("FLOAT16", "0"))
|
||||
|
||||
|
|
@ -73,14 +75,12 @@ class CLProgram:
|
|||
def __call__(self, *args):
|
||||
CL.kernel_count += 1
|
||||
if DEBUG >= 4: print(args[0], args[1], self.prg)
|
||||
if OSX and DEBUG >= 2: st = time.monotonic_ns()
|
||||
if CL.CACHE is not None: CL.CACHE.append((self, args))
|
||||
else: e = self.clprg(CL().cl_queue, *args)
|
||||
if DEBUG >= 2:
|
||||
CL.cl_queue.finish()
|
||||
# NOTE: Profiling is (sadly) broken in OS X, so we take the real kernel time
|
||||
# BOUNTY: will paypal $50 to anyone who fixes this
|
||||
et = (time.monotonic_ns() - st) if OSX else (e.profile.end - e.profile.start)
|
||||
# NOTE: Profiling is not in ns in OS X, we multiply by a computed ratio
|
||||
et = (e.profile.end - e.profile.start) * OSX_TIMING_RATIO
|
||||
if DEBUG >= 1:
|
||||
CL.time_sum += 0 if DEBUG <= 1 or CL.CACHE is not None else et
|
||||
CL.ops_sum += self.op_estimate
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue