mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
Merge branch 'master' into autogen-c
This commit is contained in:
commit
bf0babeead
9 changed files with 78 additions and 34 deletions
|
|
@ -35,7 +35,7 @@ class InstInfo:
|
|||
hit:int=0
|
||||
lat:int=0
|
||||
stall:int=0
|
||||
def __str__(self): return f"{self.inst:>20} hits:{self.typ:>6} hits:{self.hit:>6} latency:{self.lat:>6} stall:{self.stall:>6}"
|
||||
def __str__(self): return f"{self.inst:>20} type:{self.typ:>6} hits:{self.hit:>6} latency:{self.lat:>6} stall:{self.stall:>6}"
|
||||
|
||||
def on_ev(self, ev):
|
||||
self.hit, self.lat, self.stall = self.hit + 1, self.lat + ev.duration, self.stall + ev.stall
|
||||
|
|
@ -61,6 +61,8 @@ class WaveExec:
|
|||
wave_id:int
|
||||
cu:int
|
||||
simd:int
|
||||
begin_time:int
|
||||
end_time:int
|
||||
insts:list[InstExec]
|
||||
|
||||
class _ROCParseCtx:
|
||||
|
|
@ -99,7 +101,7 @@ class _ROCParseCtx:
|
|||
|
||||
if ev.instructions_size > 0:
|
||||
self.wave_events[key:=PrgExec(unwrap(self.active_kern), ev.wave_id, ev.cu, ev.simd)] = asm
|
||||
self.inst_execs.setdefault(key.name, []).append(WaveExec(ev.wave_id, ev.cu, ev.simd, inst_execs))
|
||||
self.inst_execs.setdefault(key.name, []).append(WaveExec(ev.wave_id, ev.cu, ev.simd, ev.begin_time, ev.end_time, inst_execs))
|
||||
|
||||
def decode(profile:list[ProfileEvent]) -> _ROCParseCtx:
|
||||
dev_events:dict[str, ProfileDeviceEvent] = {}
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import os
|
||||
os.environ["PYTHONPATH"] = "."
|
||||
os.environ["SQTT"] = "1"
|
||||
os.environ["AMD"] = "1"
|
||||
if "DEV" not in os.environ: os.environ["DEV"] = "AMD"
|
||||
os.environ["VIZ"] = "1"
|
||||
os.environ["AMD_LLVM"] = "0"
|
||||
|
||||
|
|
@ -16,7 +16,7 @@ from tinygrad.device import Device, ProfileDeviceEvent
|
|||
|
||||
from extra.sqtt.roc import decode, InstExec, PrgExec
|
||||
|
||||
dev = Device["AMD"]
|
||||
dev = Device[os.environ["DEV"]]
|
||||
|
||||
def custom(arg:str, s:UOp|None=None) -> UOp: return UOp(Ops.CUSTOM, src=(s,) if s is not None else (), arg=arg)
|
||||
|
||||
|
|
@ -39,9 +39,10 @@ def save_sqtt():
|
|||
sqtt:dict[PrgExec, list[InstExec]] = {}
|
||||
yield sqtt
|
||||
# decode sqtt
|
||||
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())])
|
||||
assert len(rctx.inst_execs) > 0, "empty sqtt output"
|
||||
sqtt.update(rctx.inst_execs)
|
||||
if os.environ["DEV"] == "AMD":
|
||||
rctx = decode(dev.profile_events+[ProfileDeviceEvent("AMD", props=dev.device_props())])
|
||||
assert len(rctx.inst_execs) > 0, "empty sqtt output"
|
||||
sqtt.update(rctx.inst_execs)
|
||||
|
||||
class TestTiming(unittest.TestCase):
|
||||
def test_v_add(self):
|
||||
|
|
|
|||
|
|
@ -436,6 +436,12 @@ class TestVizProfiler(BaseTestViz):
|
|||
sz = len(get_profile(prof))
|
||||
self.assertLessEqual(sz/n_events, 26)
|
||||
|
||||
def test_calltrace(self):
|
||||
def fxn(): return Tensor.empty(10).mul(2).realize()
|
||||
fxn()
|
||||
trace = get_viz_list()[0]["steps"][0]["trace"]
|
||||
assert any(fxn.__code__.co_filename == f and fxn.__code__.co_firstlineno == l for f,l,*_ in trace), str(trace)
|
||||
|
||||
# can pack up to 1hr 11 min of trace events
|
||||
def test_trace_duration(self):
|
||||
dur_mins = 72
|
||||
|
|
|
|||
|
|
@ -240,11 +240,29 @@ class Profiling(contextlib.ContextDecorator):
|
|||
|
||||
def perf_counter_us() -> decimal.Decimal: return decimal.Decimal(time.perf_counter_ns())/1000
|
||||
|
||||
@functools.cache
def lines(fn) -> list[str]:
  """Return the lines of the text file `fn`, or an empty list if it cannot be read.

  The file is decoded as UTF-8 and every returned line keeps its trailing newline. Results are memoized per `fn`
  by functools.cache, so each path is opened at most once per process and later changes on disk are not seen.

  Returns [] when the path cannot be opened (missing file, directory, permission error - all OSError) or when
  its content is not valid UTF-8.
  """
  # FileNotFoundError is already an OSError; UnicodeDecodeError covers files that exist but are not UTF-8 text
  try:
    with open(fn, encoding="utf-8") as f: return f.readlines()
  except (OSError, UnicodeDecodeError): return []
|
||||
|
||||
def printable(loc:tuple[str, int]) -> str:
  """Return the stripped source text at `loc`, given as (filename, 1-based line number).

  Returns "<missing>" when the line number is not a positive int or lies past the end of what `lines` returns
  for the file.
  """
  fn, lineno = loc[0], loc[1]
  # line numbers are 1-based: without this guard 0 or a negative value would index from the end of the file and
  # return an unrelated line, and a None line number would raise TypeError
  if not isinstance(lineno, int) or lineno < 1: return "<missing>"
  try: return lines(fn)[lineno-1].strip()
  except IndexError: return "<missing>"
|
||||
|
||||
def get_stacktrace(frm, max_frames=30) -> tuple[tuple, ...]:
  """Walk the callers of frame `frm` and describe up to `max_frames` of them, nearest caller first.

  `frm` itself is not included; the walk starts at `frm.f_back` and stops early once there is no caller left.
  Each entry is (filename, line number, function name, stripped source text of that line).
  """
  frames:list[tuple] = []
  cur = frm
  while len(frames) < max_frames:
    cur = cur.f_back
    if cur is None: break
    code = cur.f_code
    frames.append((code.co_filename, cur.f_lineno, code.co_name, printable((code.co_filename, cur.f_lineno))))
  return tuple(frames)
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class TracingKey:
|
||||
display_name:str # display name of this trace event
|
||||
keys:tuple[Any, ...]=() # optional keys to search for related traces
|
||||
ret:Any=None
|
||||
tb:tuple[tuple, ...]|None=field(default_factory=lambda: get_stacktrace(sys._getframe(1)) if VIZ else None)
|
||||
|
||||
class ProfileEvent: pass
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
from __future__ import annotations
|
||||
import os, ctypes, functools, mmap, struct, array, math, sys, weakref
|
||||
import os, ctypes, functools, mmap, struct, array, math, sys, weakref, contextlib
|
||||
assert sys.platform != 'win32'
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, cast
|
||||
|
|
@ -9,7 +9,8 @@ from tinygrad.runtime.support.hcq import FileIOInterface, MMIOInterface
|
|||
from tinygrad.runtime.autogen import kgsl, adreno
|
||||
from tinygrad.runtime.ops_cl import CLCompiler, CLDevice
|
||||
from tinygrad.renderer.cstyle import QCOMRenderer
|
||||
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport, cpu_profile, lo32, PROFILE, colored
|
||||
from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport, cpu_profile, lo32, PROFILE
|
||||
from tinygrad.runtime.support.system import System
|
||||
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2
|
||||
|
|
@ -350,9 +351,8 @@ class QCOMDevice(HCQCompiled):
|
|||
# a7xx start with 730x or 'Cxxx', a8xx starts 'Exxx'
|
||||
if self.gpu_id[:2] >= (7, 3): raise RuntimeError(f"Unsupported GPU: chip_id={info.chip_id:#x}")
|
||||
|
||||
if PROFILE and self.gpu_id[:2] < (7, 3) and int(FileIOInterface('/sys/class/kgsl/kgsl-3d0/idle_timer', os.O_RDONLY).read(), 0) < 4000000000:
|
||||
print(colored("WARNING: gpu can go into suspend mode and reset timestamps. "
|
||||
"Run 'echo \"4294947000\" | sudo tee /sys/class/kgsl/kgsl-3d0/idle_timer' to prevent idle state.", "yellow"))
|
||||
if PROFILE and self.gpu_id[:2] < (7, 3):
|
||||
System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", value="4000000000", msg="Failed to disable suspend mode", expected="4294967276")
|
||||
|
||||
compilers = [(QCOMRenderer, functools.partial(QCOMCompiler, device))]
|
||||
super().__init__(device, QCOMAllocator(self), compilers, functools.partial(QCOMProgram, self), QCOMSignal,
|
||||
|
|
@ -378,3 +378,7 @@ class QCOMDevice(HCQCompiled):
|
|||
self.synchronize()
|
||||
self._gpu_free(self._stack)
|
||||
self._stack = self._gpu_alloc(sz)
|
||||
|
||||
def _at_profile_finalize(self):
|
||||
super()._at_profile_finalize()
|
||||
with contextlib.suppress(RuntimeError): System.write_sysfs("/sys/class/kgsl/kgsl-3d0/idle_timer", "10", "Failed to reenable suspend mode")
|
||||
|
|
|
|||
|
|
@ -12,6 +12,11 @@ MAP_FIXED, MAP_LOCKED, MAP_POPULATE, MAP_NORESERVE = 0x10, 0 if OSX else 0x2000,
|
|||
class PCIBarInfo: addr:int; size:int # noqa: E702
|
||||
|
||||
class _System:
|
||||
def write_sysfs(self, path:str, value:str, msg:str, expected:str|None=None):
  """Make sure the file at `path` reads back the wanted value, writing `value` through sudo if it does not.

  The first line of the file is compared against `expected` when given (for files that read back something
  other than what was written), otherwise against `value`. Nothing is written when it already matches.

  Raises RuntimeError, built from `msg` and the shell command to run by hand, if the file still does not read
  back the wanted value after the write.
  """
  wanted = expected or value
  def first_line() -> str: return FileIOInterface(path, os.O_RDONLY).read().splitlines()[0]
  if first_line() == wanted: return
  # the exit status of the command is ignored; success is decided by re-reading the file below
  os.system(cmd:=f"sudo sh -c 'echo {value} > {path}'")
  if first_line() != wanted: raise RuntimeError(f"{msg}. Please run {cmd} manually.")
|
||||
|
||||
@functools.cached_property
|
||||
def atomic_lib(self): return ctypes.CDLL(ctypes.util.find_library('atomic')) if sys.platform == "linux" else None
|
||||
|
||||
|
|
@ -26,9 +31,7 @@ class _System:
|
|||
|
||||
@functools.cached_property
|
||||
def pagemap(self) -> FileIOInterface:
|
||||
if FileIOInterface(reloc_sysfs:="/proc/sys/vm/compact_unevictable_allowed", os.O_RDONLY).read()[0] != "0":
|
||||
os.system(cmd:=f"sudo sh -c 'echo 0 > {reloc_sysfs}'")
|
||||
assert FileIOInterface(reloc_sysfs, os.O_RDONLY).read()[0] == "0", f"Failed to disable migration of locked pages. Please run {cmd} manually."
|
||||
self.write_sysfs("/proc/sys/vm/compact_unevictable_allowed", "0", "Failed to disable migration of locked pages")
|
||||
return FileIOInterface("/proc/self/pagemap", os.O_RDONLY)
|
||||
|
||||
@functools.cached_property
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from tinygrad.mixin import OpMixin
|
|||
from tinygrad.dtype import ConstType, ImageDType, dtypes, DType, truncate, PtrDType, least_upper_dtype, Invalid, InvalidType, AddrSpace
|
||||
from tinygrad.helpers import ContextVar, all_int, prod, getenv, all_same, Context, partition, temp, unwrap, T, argfix, Metadata, flatten, TRACEMETA
|
||||
from tinygrad.helpers import PICKLE_BUFFERS, PROFILE, dedup, cdiv, cmod, diskcache_put, to_function_name, cpu_profile, TracingKey, VIZ, SPEC, CI
|
||||
from tinygrad.helpers import strip_parens, colored, ansilen
|
||||
from tinygrad.helpers import strip_parens, colored, ansilen, printable
|
||||
if TYPE_CHECKING:
|
||||
from tinygrad.device import Buffer, MultiBuffer
|
||||
|
||||
|
|
@ -865,14 +865,6 @@ def get_location() -> tuple[str, int]:
|
|||
frm = frm.f_back
|
||||
return frm.f_code.co_filename, frm.f_lineno
|
||||
|
||||
@functools.cache
|
||||
def lines(fn) -> list[str]:
|
||||
with open(fn) as f: return f.readlines()
|
||||
|
||||
def printable(loc:tuple[str, int]) -> str:
|
||||
try: return lines(loc[0])[loc[1]-1].strip()
|
||||
except FileNotFoundError: return "<missing>"
|
||||
|
||||
class UPat(OpMixin):
|
||||
__slots__ = ("op", "dtype", "arg", "name", "src")
|
||||
def __init__(self, op:Ops|tuple[Ops, ...]|set[Ops]|None=None, dtype:DType|tuple[DType, ...]|None=None,
|
||||
|
|
|
|||
|
|
@ -551,6 +551,7 @@ document.getElementById("zoom-to-fit-btn").addEventListener("click", () => {
|
|||
|
||||
// **** main VIZ interface
|
||||
|
||||
const pathLink = (fp, lineno) => d3.create("a").attr("href", "vscode://file/"+fp+":"+lineno).text(`${fp.split("/").at(-1)}:${lineno}`);
|
||||
function codeBlock(st, language, { loc, wrap }={}) {
|
||||
const code = document.createElement("code");
|
||||
// plaintext renders like a terminal print, otherwise render with syntax highlighting
|
||||
|
|
@ -559,11 +560,7 @@ function codeBlock(st, language, { loc, wrap }={}) {
|
|||
code.className = "hljs";
|
||||
const ret = document.createElement("pre");
|
||||
if (wrap) ret.className = "wrap";
|
||||
if (loc != null) {
|
||||
const link = ret.appendChild(document.createElement("a"));
|
||||
link.href = "vscode://file/"+loc.join(":");
|
||||
link.textContent = `${loc[0].split("/").at(-1)}:${loc[1]}`+"\n\n";
|
||||
}
|
||||
if (loc != null) ret.appendChild(pathLink(loc[0], loc[1]).style("margin-bottom", "4px").node());
|
||||
ret.appendChild(code);
|
||||
return ret;
|
||||
}
|
||||
|
|
@ -763,6 +760,15 @@ async function main() {
|
|||
// ** right sidebar code blocks
|
||||
const codeElement = codeBlock(ret[currentRewrite].uop, "python", { wrap:false });
|
||||
metadata.replaceChildren(toggleLabel, codeBlock(step.code_line, "python", { loc:step.loc, wrap:true }), codeElement);
|
||||
if (step.trace) {
|
||||
const trace = d3.create("pre").append("code").classed("hljs", true);
|
||||
for (let i=step.trace.length-1; i>=0; i--) {
|
||||
const [fp, lineno, fn, code] = step.trace[i];
|
||||
trace.append("div").style("margin-bottom", "2px").style("display","flex").text(fn+" ").append(() => pathLink(fp, lineno).node());
|
||||
trace.append("div").html(hljs.highlight(code, { language: "python" }).value).style("margin-bottom", "1ex");
|
||||
}
|
||||
metadata.insertBefore(trace.node().parentNode, codeElement);
|
||||
}
|
||||
// ** rewrite steps
|
||||
if (step.match_count >= 1) {
|
||||
const rewriteList = metadata.appendChild(document.createElement("div"));
|
||||
|
|
|
|||
|
|
@ -7,7 +7,8 @@ from http.server import BaseHTTPRequestHandler
|
|||
from urllib.parse import parse_qs, urlparse
|
||||
from typing import Any, TypedDict, TypeVar, Generator, Callable
|
||||
from tinygrad.helpers import colored, getenv, tqdm, unwrap, word_wrap, TRACEMETA, ProfileEvent, ProfileRangeEvent, TracingKey, ProfilePointEvent, temp
|
||||
from tinygrad.uop.ops import TrackedGraphRewrite, RewriteTrace, UOp, Ops, printable, GroupOp, srender, sint, sym_infer, range_str, pyrender
|
||||
from tinygrad.helpers import printable
|
||||
from tinygrad.uop.ops import TrackedGraphRewrite, RewriteTrace, UOp, Ops, GroupOp, srender, sint, sym_infer, range_str, pyrender
|
||||
from tinygrad.uop.ops import print_uops, range_start, multirange_str
|
||||
from tinygrad.device import ProfileDeviceEvent, ProfileGraphEvent, ProfileGraphEntry, Device
|
||||
from tinygrad.renderer import ProgramSpec
|
||||
|
|
@ -30,7 +31,7 @@ ref_map:dict[Any, int] = {}
|
|||
def get_rewrites(t:RewriteTrace) -> list[dict]:
|
||||
ret = []
|
||||
for i,(k,v) in enumerate(zip(t.keys, t.rewrites)):
|
||||
steps = [{"name":s.name, "loc":s.loc, "match_count":len(s.matches), "code_line":printable(s.loc),
|
||||
steps = [{"name":s.name, "loc":s.loc, "match_count":len(s.matches), "code_line":printable(s.loc), "trace":k.tb if j == 0 else None,
|
||||
"query":f"/ctxs?ctx={i}&idx={j}", "depth":s.depth} for j,s in enumerate(v)]
|
||||
if isinstance(k.ret, ProgramSpec):
|
||||
steps.append({"name":"View UOp List", "query":f"/render?ctx={i}&fmt=uops", "depth":0})
|
||||
|
|
@ -216,12 +217,23 @@ def load_sqtt(profile:list[ProfileEvent]) -> None:
|
|||
if (r:=ref_map.get(name)): name = ctxs[r]["name"]
|
||||
steps.append({"name":name, "depth":0, "query":f"/render?ctx={len(ctxs)}&step={len(steps)}&fmt=counters",
|
||||
"data":{"src":trace.keys[r].ret.src if r else name, "lang":"cpp"}})
|
||||
|
||||
# Idle: The total time gap between the completion of previous instruction and the beginning of the current instruction.
|
||||
# The idle time can be caused by:
|
||||
# * Arbiter loss
|
||||
# * Source or destination register dependency
|
||||
# * Instruction cache miss
|
||||
# Stall: The total number of cycles the hardware pipe couldn't issue an instruction.
|
||||
# Duration: Total latency in cycles, defined as "Stall time + Issue time" for gfx9 or "Stall time + Execute time" for gfx10+.
|
||||
for w in waves:
|
||||
rows = [(e.inst, e.time, e.time-(w.insts[i-1].time if i else 0), e.dur, e.stall, str(e.typ).split("_")[-1]) for i,e in enumerate(w.insts)]
|
||||
summary = [{"label":"Total Cycles", "value":w.insts[-1].time-w.insts[0].time if w.insts else 0}, {"label":"CU", "value":w.cu},
|
||||
rows, prev_instr = [], w.begin_time
|
||||
for i,e in enumerate(w.insts):
|
||||
rows.append((e.inst, e.time, max(0, e.time-prev_instr), e.dur, e.stall, str(e.typ).split("_")[-1]))
|
||||
prev_instr = max(prev_instr, e.time + e.dur)
|
||||
summary = [{"label":"Total Cycles", "value":w.end_time-w.begin_time}, {"label":"CU", "value":w.cu},
|
||||
{"label":"SIMD", "value":w.simd}]
|
||||
steps.append({"name":f"Wave {w.wave_id}", "depth":1, "query":f"/render?ctx={len(ctxs)}&step={len(steps)}&fmt=counters",
|
||||
"data":{"rows":rows, "cols":["Instruction", "Clk", "Wait", "Duration", "Stall", "Type"], "summary":summary}})
|
||||
"data":{"rows":rows, "cols":["Instruction", "Clk", "Idle", "Duration", "Stall", "Type"], "summary":summary}})
|
||||
ctxs.append({"name":"Counters", "steps":steps})
|
||||
|
||||
def get_profile(profile:list[ProfileEvent]) -> bytes|None:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue