Merge branch 'master' into autogen-c

This commit is contained in:
George Hotz 2025-11-12 08:43:58 -08:00 committed by GitHub
commit bf0babeead
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 78 additions and 34 deletions

View file

@ -35,7 +35,7 @@ class InstInfo:
hit:int=0
lat:int=0
stall:int=0
def __str__(self): return f"{self.inst:>20} hits:{self.typ:>6} hits:{self.hit:>6} latency:{self.lat:>6} stall:{self.stall:>6}"
def __str__(self): return f"{self.inst:>20} type:{self.typ:>6} hits:{self.hit:>6} latency:{self.lat:>6} stall:{self.stall:>6}"
def on_ev(self, ev):
self.hit, self.lat, self.stall = self.hit + 1, self.lat + ev.duration, self.stall + ev.stall
@ -61,6 +61,8 @@ class WaveExec:
wave_id:int
cu:int
simd:int
begin_time:int
end_time:int
insts:list[InstExec]
class _ROCParseCtx:
@ -99,7 +101,7 @@ class _ROCParseCtx:
if ev.instructions_size > 0:
self.wave_events[key:=PrgExec(unwrap(self.active_kern), ev.wave_id, ev.cu, ev.simd)] = asm
self.inst_execs.setdefault(key.name, []).append(WaveExec(ev.wave_id, ev.cu, ev.simd, inst_execs))
self.inst_execs.setdefault(key.name, []).append(WaveExec(ev.wave_id, ev.cu, ev.simd, ev.begin_time, ev.end_time, inst_execs))
def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
dev_events:dict[str, ProfileDeviceEvent] = {}

View file

@ -1,7 +1,7 @@
import os
os.environ["PYTHONPATH"] = "."
os.environ["SQTT"] = "1"
os.environ["AMD"] = "1"
if "DEV" not in os.environ: os.environ["DEV"] = "AMD"
os.environ["VIZ"] = "1"
os.environ["AMD_LLVM"] = "0"
@ -16,7 +16,7 @@ from tinygrad.device import Device, ProfileDeviceEvent
from extra.sqtt.roc import decode, InstExec, PrgExec
dev = Device["AMD"]
dev = Device[os.environ["DEV"]]
def custom(arg:str, s:UOp|None=None) -> UOp:
  """Build an `Ops.CUSTOM` UOp carrying `arg`, with `s` as its single source when one is given."""
  srcs = () if s is None else (s,)
  return UOp(Ops.CUSTOM, src=srcs, arg=arg)
@ -39,9 +39,10 @@ def save_sqtt():
sqtt:dict[PrgExec, list[InstExec]] = {}
yield sqtt
# decode sqtt
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())])
assert len(rctx.inst_execs) > 0, "empty sqtt output"
sqtt.update(rctx.inst_execs)
if os.environ["DEV"] == "AMD":
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())])
assert len(rctx.inst_execs) > 0, "empty sqtt output"
sqtt.update(rctx.inst_execs)
class TestTiming(unittest.TestCase):
def test_v_add(self):

View file

@ -436,6 +436,12 @@ class TestVizProfiler(BaseTestViz):
sz = len(get_profile(prof))
self.assertLessEqual(sz/n_events, 26)
def test_calltrace(self):
  # NOTE: fxn is deliberately left as a one-liner. The assertion below matches a trace entry's line number
  # against fxn's first line, which only coincides with the executing line when the body shares the def line.
  def fxn(): return Tensor.empty(10).mul(2).realize()
  fxn()
  first_step = get_viz_list()[0]["steps"][0]
  code, trace = fxn.__code__, first_step["trace"]
  # every trace entry starts with (filename, lineno, ...); at least one must point at fxn
  found = any(code.co_filename == fname and code.co_firstlineno == lineno for fname,lineno,*_ in trace)
  assert found, str(trace)
# can pack up to 1hr 11 min of trace events
def test_trace_duration(self):
dur_mins = 72

View file

@ -240,11 +240,29 @@ class Profiling(contextlib.ContextDecorator):
def perf_counter_us() -> decimal.Decimal:
  """Return the current `time.perf_counter_ns()` reading converted to microseconds, as a `Decimal`."""
  now_ns = time.perf_counter_ns()
  return decimal.Decimal(now_ns) / 1000
@functools.cache
def lines(fn) -> list[str]:
  """Return the lines of the text file `fn` (line endings kept), or `[]` when it cannot be read.

  Results are memoized per `fn` by `functools.cache`, so a given path is opened at most once and every
  caller shares the same list object. A missing or unreadable path yields `[]`, and so does a file
  whose contents are not valid UTF-8 (instead of propagating `UnicodeDecodeError`).
  """
  try:
    with open(fn, encoding="utf-8") as f: return f.readlines()
  # FileNotFoundError is already an OSError; UnicodeDecodeError covers files that are not UTF-8 text
  except (OSError, UnicodeDecodeError): return []
def printable(loc:tuple[str, int]) -> str:
  """Return the stripped source text at 1-based line `loc[1]` of file `loc[0]`, or "<missing>" if there is no such line."""
  try:
    src_lines = lines(loc[0])
    text = src_lines[loc[1]-1]
  # lines() gives an empty list for files it cannot read, so those end up here as well
  except IndexError:
    return "<missing>"
  return text.strip()
def get_stacktrace(frm, max_frames=30) -> tuple[tuple, ...]:
  """Describe up to `max_frames` callers of `frm` as (filename, lineno, function name, source text) tuples.

  `frm` itself is not included: the walk starts at `frm.f_back` and stops early once there is no outer frame.
  Entries are ordered from the innermost caller outwards.
  """
  frames:list[tuple] = []
  cur = frm
  for _ in range(max_frames):
    cur = cur.f_back
    if cur is None: break
    code = cur.f_code
    frames.append((code.co_filename, cur.f_lineno, code.co_name, printable((code.co_filename, cur.f_lineno))))
  return tuple(frames)
# Frozen dataclass keying a trace event; instances cannot be mutated after construction.
@dataclass(frozen=True)
class TracingKey:
display_name:str # display name of this trace event
keys:tuple[Any, ...]=() # optional keys to search for related traces
# NOTE(review): the meaning of `ret` is not visible in this block; it defaults to None
ret:Any=None
# call stack recorded when an instance is built without an explicit `tb` and VIZ is truthy, otherwise None.
# sys._getframe(1) is the frame that invoked the default_factory lambda; get_stacktrace then walks that frame's callers.
tb:tuple[tuple, ...]|None=field(default_factory=lambda: get_stacktrace(sys._getframe(1)) if VIZ else None)
# Class with an empty body; presumably the common base for the concrete profile event types — confirm against its subclasses.
class ProfileEvent: pass

View file

@ -1,5 +1,5 @@
from __future__ import annotations
import os, ctypes, functools, mmap, struct, array, math, sys, weakref
import os, ctypes, functools, mmap, struct, array, math, sys, weakref, contextlib
assert sys.platform != 'win32'
from types import SimpleNamespace
from typing import Any, cast
@ -9,7 +9,8 @@ from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
from tinygrad.runtime.autogen import kgsl, adreno
from tinygrad.runtime.ops_cl import CLCompiler, CLDevice
from tinygrad.renderer.cstyle import QCOMRenderer
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport, cpu_profile, lo32, PROFILE, colored
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport, cpu_profile, lo32, PROFILE
from tinygrad.runtime.support.system import System
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2
@ -350,9 +351,8 @@ class QCOMDevice(HCQCompiled):
# a7xx start with 730x or 'Cxxx', a8xx starts 'Exxx'
if self.gpu_id[:2] >= (7, 3): raise RuntimeError(f"Unsupported GPU: chip_id={info.chip_id:#x}")
if PROFILE and self.gpu_id[:2] < (7, 3) and int(FileIOInterface('/sys/class/kgsl/kgsl-3d0/idle_timer', os.O_RDONLY).read(), 0) < 4000000000:
print(colored("WARNING: gpu can go into suspend mode and reset timestamps. "
"Run 'echo \"4294947000\" | sudo tee /sys/class/kgsl/kgsl-3d0/idle_timer' to prevent idle state.", "yellow"))
if PROFILE and self.gpu_id[:2] < (7, 3):
System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")
compilers = [(QCOMRenderer, functools.partial(QCOMCompiler, device))]
super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
@ -378,3 +378,7 @@ class QCOMDevice(HCQCompiled):
self.synchronize()
self._gpu_free(self._stack)
self._stack = self._gpu_alloc(sz)
def _at_profile_finalize(self):
  """Run the parent finalizer, then make a best-effort attempt to set the kgsl idle_timer sysfs entry to "10"."""
  super()._at_profile_finalize()
  idle_timer = "/sys/class/kgsl/kgsl-3d0/idle_timer"
  # a RuntimeError raised by write_sysfs is deliberately ignored here
  with contextlib.suppress(RuntimeError):
    System.write_sysfs(idle_timer, "10", "Failed to reenable suspend mode")

View file

@ -12,6 +12,11 @@ MAP_FIXED, MAP_LOCKED, MAP_POPULATE, MAP_NORESERVE = 0x10, 0 if OSX else 0x2000,
class PCIBarInfo: addr:int; size:int # noqa: E702
class _System:
def write_sysfs(self, path:str, value:str, msg:str, expected:str|None=None):
  """Make the first line of `path` read `expected` (or `value` when `expected` is None/empty), writing `value` through sudo if needed.

  Nothing is written when the file already holds the wanted text. Raises RuntimeError, built from `msg` and
  the shell command to run by hand, when the file still differs after the write attempt.
  """
  wanted = expected or value
  def first_line() -> str: return FileIOInterface(path, os.O_RDONLY).read().splitlines()[0]
  if first_line() == wanted: return
  os.system(cmd:=f"sudo sh -c 'echo {value} > {path}'")
  # re-read to confirm the write actually took effect
  if first_line() != wanted: raise RuntimeError(f"{msg}. Please run {cmd} manually.")
@functools.cached_property
def atomic_lib(self): return ctypes.CDLL(ctypes.util.find_library('atomic')) if sys.platform == "linux" else None
@ -26,9 +31,7 @@ class _System:
@functools.cached_property
def pagemap(self) -> FileIOInterface:
if FileIOInterface(reloc_sysfs:="/proc/sys/vm/compact_unevictable_allowed", os.O_RDONLY).read()[0] != "0":
os.system(cmd:=f"sudo sh -c 'echo 0 > {reloc_sysfs}'")
assert FileIOInterface(reloc_sysfs, os.O_RDONLY).read()[0] == "0", f"Failed to disable migration of locked pages. Please run {cmd} manually."
self.write_sysfs("/proc/sys/vm/compact_unevictable_allowed", "0", "Failed to disable migration of locked pages")
return FileIOInterface("/proc/self/pagemap", os.O_RDONLY)
@functools.cached_property

View file

@ -8,7 +8,7 @@ from tinygrad.mixin import OpMixin
from tinygrad.dtype import ConstType, ImageDType, dtypes, DType, truncate, PtrDType, least_upper_dtype, Invalid, InvalidType, AddrSpace
from tinygrad.helpers import ContextVar, all_int, prod, getenv, all_same, Context, partition, temp, unwrap, T, argfix, Metadata, flatten, TRACEMETA
from tinygrad.helpers import PICKLE_BUFFERS, PROFILE, dedup, cdiv, cmod, diskcache_put, to_function_name, cpu_profile, TracingKey, VIZ, SPEC, CI
from tinygrad.helpers import strip_parens, colored, ansilen
from tinygrad.helpers import strip_parens, colored, ansilen, printable
if TYPE_CHECKING:
from tinygrad.device import Buffer, MultiBuffer
@ -865,14 +865,6 @@ def get_location() -> tuple[str, int]:
frm = frm.f_back
return frm.f_code.co_filename, frm.f_lineno
@functools.cache
def lines(fn) -> list[str]:
with open(fn) as f: return f.readlines()
def printable(loc:tuple[str, int]) -> str:
try: return lines(loc[0])[loc[1]-1].strip()
except FileNotFoundError: return "<missing>"
class UPat(OpMixin):
__slots__ = ("op", "dtype", "arg", "name", "src")
def __init__(self, op:Ops|tuple[Ops, ...]|set[Ops]|None=None, dtype:DType|tuple[DType, ...]|None=None,

View file

@ -551,6 +551,7 @@ document.getElementById("zoom-to-fit-btn").addEventListener("click", () => {
// **** main VIZ interface
// Builds a d3 <a> selection whose href is a vscode://file/ URL for fp at lineno, labelled "<basename>:<lineno>".
const pathLink = (fp, lineno) => {
  const basename = fp.split("/").at(-1);
  return d3.create("a").attr("href", "vscode://file/"+fp+":"+lineno).text(`${basename}:${lineno}`);
};
function codeBlock(st, language, { loc, wrap }={}) {
const code = document.createElement("code");
// plaintext renders like a terminal print, otherwise render with syntax highlighting
@ -559,11 +560,7 @@ function codeBlock(st, language, { loc, wrap }={}) {
code.className = "hljs";
const ret = document.createElement("pre");
if (wrap) ret.className = "wrap";
if (loc != null) {
const link = ret.appendChild(document.createElement("a"));
link.href = "vscode://file/"+loc.join(":");
link.textContent = `${loc[0].split("/").at(-1)}:${loc[1]}`+"\n\n";
}
if (loc != null) ret.appendChild(pathLink(loc[0], loc[1]).style("margin-bottom", "4px").node());
ret.appendChild(code);
return ret;
}
@ -763,6 +760,15 @@ async function main() {
// ** right sidebar code blocks
const codeElement = codeBlock(ret[currentRewrite].uop, "python", { wrap:false });
metadata.replaceChildren(toggleLabel, codeBlock(step.code_line, "python", { loc:step.loc, wrap:true }), codeElement);
if (step.trace) {
const trace = d3.create("pre").append("code").classed("hljs", true);
for (let i=step.trace.length-1; i>=0; i--) {
const [fp, lineno, fn, code] = step.trace[i];
trace.append("div").style("margin-bottom", "2px").style("display","flex").text(fn+" ").append(() => pathLink(fp, lineno).node());
trace.append("div").html(hljs.highlight(code, { language: "python" }).value).style("margin-bottom", "1ex");
}
metadata.insertBefore(trace.node().parentNode, codeElement);
}
// ** rewrite steps
if (step.match_count >= 1) {
const rewriteList = metadata.appendChild(document.createElement("div"));

View file

@ -7,7 +7,8 @@ from http.server import BaseHTTPRequestHandler
from urllib.parse import parse_qs, urlparse
from typing import Any, TypedDict, TypeVar, Generator, Callable
from tinygrad.helpers import colored, getenv, tqdm, unwrap, word_wrap, TRACEMETA, ProfileEvent, ProfileRangeEvent, TracingKey, ProfilePointEvent, temp
from tinygrad.uop.ops import TrackedGraphRewrite, RewriteTrace, UOp, Ops, printable, GroupOp, srender, sint, sym_infer, range_str, pyrender
from tinygrad.helpers import printable
from tinygrad.uop.ops import TrackedGraphRewrite, RewriteTrace, UOp, Ops, GroupOp, srender, sint, sym_infer, range_str, pyrender
from tinygrad.uop.ops import print_uops, range_start, multirange_str
from tinygrad.device import ProfileDeviceEvent, ProfileGraphEvent, ProfileGraphEntry, Device
from tinygrad.renderer import ProgramSpec
@ -30,7 +31,7 @@ ref_map:dict[Any, int] = {}
def get_rewrites(t:RewriteTrace) -> list[dict]:
ret = []
for i,(k,v) in enumerate(zip(t.keys, t.rewrites)):
steps = [{"name":s.name, "loc":s.loc, "match_count":len(s.matches), "code_line":printable(s.loc),
steps = [{"name":s.name, "loc":s.loc, "match_count":len(s.matches), "code_line":printable(s.loc), "trace":k.tb if j == 0 else None,
"query":f"/ctxs?ctx={i}&idx={j}", "depth":s.depth} for j,s in enumerate(v)]
if isinstance(k.ret, ProgramSpec):
steps.append({"name":"View UOp List", "query":f"/render?ctx={i}&fmt=uops", "depth":0})
@ -216,12 +217,23 @@ def load_sqtt(profile:list[ProfileEvent]) -> None:
if (r:=ref_map.get(name)): name = ctxs[r]["name"]
steps.append({"name":name, "depth":0, "query":f"/render?ctx={len(ctxs)}&step={len(steps)}&fmt=counters",
"data":{"src":trace.keys[r].ret.src if r else name, "lang":"cpp"}})
# Idle: The total time gap between the completion of the previous instruction and the beginning of the current instruction.
# The idle time can be caused by:
# * Arbiter loss
# * Source or destination register dependency
# * Instruction cache miss
# Stall: The total number of cycles the hardware pipe couldn't issue an instruction.
# Duration: Total latency in cycles, defined as "Stall time + Issue time" for gfx9 or "Stall time + Execute time" for gfx10+.
for w in waves:
rows = [(e.inst, e.time, e.time-(w.insts[i-1].time if i else 0), e.dur, e.stall, str(e.typ).split("_")[-1]) for i,e in enumerate(w.insts)]
summary = [{"label":"Total Cycles", "value":w.insts[-1].time-w.insts[0].time if w.insts else 0}, {"label":"CU", "value":w.cu},
rows, prev_instr = [], w.begin_time
for i,e in enumerate(w.insts):
rows.append((e.inst, e.time, max(0, e.time-prev_instr), e.dur, e.stall, str(e.typ).split("_")[-1]))
prev_instr = max(prev_instr, e.time + e.dur)
summary = [{"label":"Total Cycles", "value":w.end_time-w.begin_time}, {"label":"CU", "value":w.cu},
{"label":"SIMD", "value":w.simd}]
steps.append({"name":f"Wave {w.wave_id}", "depth":1, "query":f"/render?ctx={len(ctxs)}&step={len(steps)}&fmt=counters",
"data":{"rows":rows, "cols":["Instruction", "Clk", "Wait", "Duration", "Stall", "Type"], "summary":summary}})
"data":{"rows":rows, "cols":["Instruction", "Clk", "Idle", "Duration", "Stall", "Type"], "summary":summary}})
ctxs.append({"name":"Counters", "steps":steps})
def get_profile(profile:list[ProfileEvent]) -> bytes|None: