os x profiling: this ratio is exact i believe

2026-06-24 02:14:17 +00:00 · 2023-01-28 19:02:51 -08:00 · 2023-01-28 19:02:51 -08:00 · b0df4d99a0
commit b0df4d99a0
parent c0963b723e
3 changed files with 52 additions and 6 deletions
--- a/extra/thneed.py
+++ b/extra/thneed.py
@ -9,6 +9,7 @@ from tinygrad.llops.ops_gpu import CL, CLProgram
 from tinygrad.helpers import prod
 from collections import defaultdict
 import pyopencl as cl
+from tinygrad.runtime.opencl import OSX_TIMING_RATIO

 DEBUGCL = int(os.getenv("DEBUGCL", 0))
 FLOAT16 = int(os.getenv("FLOAT16", 0))
@ -279,7 +280,7 @@ class Thneed:
    if DEBUGCL >= 1:
      total_runtime = 0
      for i, ((prg, args), e) in enumerate(zip(self.cl_cache, events)):
-        runtime = (e.profile.end - e.profile.start)
+        runtime = (e.profile.end - e.profile.start) * OSX_TIMING_RATIO
        print(f"{i:3d} time {total_runtime/1e6:5.2f} ms running {prg.name:20s} with {str(args[0]):15s} {str(args[1]):15s} count {len(args)-2:2d} runtime {runtime/1e3:7.2f} us {(prg.op_estimate)/runtime:9.2f} GFLOPS {prg.options} -> {args[2].shape if hasattr(args[2], 'shape') else args[2].size}")
        if (DEBUGCL >= 2 and int(os.getenv("PRINT_KERNEL", "-1")) == i) or DEBUGCL >= 3:
          print(prg.prg)
--- a/test/external_osx_profiling.py
+++ b/test/external_osx_profiling.py
@ -0,0 +1,45 @@
+from tinygrad.runtime.opencl import CLProgram, CL, CLBuffer
+import time
+
+
+N = 1000000
+a = CLBuffer(N*4)
+b = CLBuffer(N*4)
+c = CLBuffer(N*4)
+
+prg = CLProgram("test", """__kernel void test(__global float *a, __global float *b, __global float *c) {
+  int idx = get_global_id(0);
+  a[idx] = b[idx] + c[idx];
+}""")
+prg.clprg(CL().cl_queue, [N,], None, a.cl, b.cl, c.cl)
+
+t1 = time.monotonic_ns()
+e1 = prg.clprg(CL().cl_queue, [N,], None, a.cl, b.cl, c.cl)
+CL.cl_queue.finish()
+t2 = time.monotonic_ns()
+time.sleep(3)
+t3 = time.monotonic_ns()
+e2 = prg.clprg(CL().cl_queue, [N,], None, a.cl, b.cl, c.cl)
+CL.cl_queue.finish()
+t4 = time.monotonic_ns()
+
+print(e1.profile.queued)
+print(e1.profile.submit)
+print(e1.profile.start)
+print(e1.profile.end)
+
+print(e1, e2)
+print(t2-t1, e1.profile.end - e1.profile.start)
+print(t4-t3, e2.profile.end - e2.profile.start)
+print(t3-t2, e2.profile.queued-e1.profile.end)
+print((t3-t2) / (e2.profile.start-e1.profile.end), "ratio")
+
+print("ratio since boot", t1/e1.profile.start)
+
+print(e1.profile.start)
+print(e1.profile.end)
+print(e2.profile.start)
+print(e2.profile.end)
+
+import pdb
+pdb.set_trace()
--- a/tinygrad/runtime/opencl.py
+++ b/tinygrad/runtime/opencl.py
@ -1,4 +1,4 @@
-import os, functools, time, platform
+import os, functools, platform
 import numpy as np
 import pyopencl as cl  # type: ignore
 from typing import Dict, Optional, Tuple, List
@ -6,6 +6,8 @@ from collections import defaultdict
 from tinygrad.ops import DEBUG

 OSX = platform.system() == "Darwin"
+OSX_TIMING_RATIO = (125/3) if OSX else 1.0   # see test/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
+
 CLCACHE = int(os.getenv("CLCACHE", "1"))
 FLOAT16 = int(os.getenv("FLOAT16", "0"))

@ -73,14 +75,12 @@ class CLProgram:
  def __call__(self, *args):
    CL.kernel_count += 1
    if DEBUG >= 4: print(args[0], args[1], self.prg)
-    if OSX and DEBUG >= 2: st = time.monotonic_ns()
    if CL.CACHE is not None: CL.CACHE.append((self, args))
    else: e = self.clprg(CL().cl_queue, *args)
    if DEBUG >= 2:
      CL.cl_queue.finish()
-      # NOTE: Profiling is (sadly) broken in OS X, so we take the real kernel time
-      # BOUNTY: will paypal $50 to anyone who fixes this
-      et = (time.monotonic_ns() - st) if OSX else (e.profile.end - e.profile.start)
+      # NOTE: Profiling is not in ns in OS X, we multiply by a computed ratio
+      et = (e.profile.end - e.profile.start) * OSX_TIMING_RATIO
    if DEBUG >= 1:
      CL.time_sum += 0 if DEBUG <= 1 or CL.CACHE is not None else et
      CL.ops_sum += self.op_estimate