renderer/amd: move in tree (#14702)

* renderer/amd: move in tree

* fix paths in tests

* 24000 lines

* no delete for amd files
This commit is contained in:
George Hotz 2026-02-12 18:09:16 +08:00 committed by GitHub
commit 4680247e35
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
71 changed files with 266 additions and 253 deletions

View file

@ -43,7 +43,7 @@ jobs:
run: sudo apt-get install -y --no-install-recommends libclang-20-dev llvm-20-dev hip-dev libusb-1.0-0-dev libdrm-dev
- name: Regenerate autogen files
run: |
find tinygrad/runtime/autogen -type f -name "*.py" -not -name "__init__.py" -not -name "comgr_3.py" -not -name "metal.py" -not -name "iokit.py" -not -name "corefoundation.py" -not -name "libclang.py" -delete
find tinygrad/runtime/autogen -type f -name "*.py" -not -path "*/amd/*" -not -name "__init__.py" -not -name "comgr_3.py" -not -name "metal.py" -not -name "iokit.py" -not -name "corefoundation.py" -not -name "libclang.py" -delete
python3 -c "from tinygrad.runtime.autogen import opencl"
python3 -c "from tinygrad.runtime.autogen import cuda, nvrtc, nvjitlink, nv_570, nv_580, nv"
python3 -c "from tinygrad.runtime.autogen import comgr, hsa, hip, amd_gpu, sqtt, rocprof, amdgpu_kd, amdgpu_drm"

View file

@ -295,8 +295,8 @@ jobs:
python extra/optimization/extract_dataset.py
gzip -c /tmp/sops > extra/datasets/sops.gz
#DEBUG=1 MIN_ASTS=1 python extra/optimization/get_action_space.py
- name: Repo line count < 20000 lines
run: MAX_LINE_COUNT=20000 python sz.py
- name: Repo line count < 24000 lines
run: MAX_LINE_COUNT=24000 python sz.py
spec:
strategy:
@ -683,8 +683,8 @@ jobs:
python-version: '3.14'
- name: Verify AMD autogen is up to date
run: |
python -m extra.assembly.amd.generate
git diff --exit-code extra/assembly/amd/autogen/
python -m tinygrad.renderer.amd.generate
git diff --exit-code tinygrad/runtime/autogen/amd/
- name: Install LLVM 21
run: |
wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
@ -692,13 +692,13 @@ jobs:
sudo apt-get update
sudo apt-get install llvm-21 llvm-21-tools cloc
- name: RDNA3 Line Count
run: cloc --by-file extra/assembly/amd/*.py
run: cloc --by-file tinygrad/renderer/amd/*.py
- name: Install rocprof-trace-decoder
run: sudo PYTHONPATH="." ./extra/sqtt/install_sqtt_decoder.py
- name: Run RDNA3 emulator tests
run: AMD_LLVM=0 python -m pytest -n=auto extra/assembly/amd/ --durations 20
run: AMD_LLVM=0 python -m pytest -n=auto test/amd/ --durations 20
- name: Run RDNA3 emulator tests (AMD_LLVM=1)
run: AMD_LLVM=1 python -m pytest -n=auto extra/assembly/amd/ --durations 20
run: AMD_LLVM=1 python -m pytest -n=auto test/amd/ --durations 20
- name: Run RDNA3 dtype tests
run: AMD_LLVM=0 pytest -n=auto test/backend/test_dtype_alu.py test/backend/test_dtype.py --durations 20
- name: Run RDNA3 dtype tests (AMD_LLVM=1)

View file

@ -76,12 +76,12 @@ VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()"
## Auto-generated Files (DO NOT EDIT)
The following files are auto-generated and should never be edited manually:
- `extra/assembly/amd/autogen/{arch}/__init__.py` - Generated by `python -m extra.assembly.amd.dsl --arch {arch}`
- `extra/assembly/amd/autogen/{arch}/gen_pcode.py` - Generated by `python -m extra.assembly.amd.pcode --arch {arch}`
- `tinygrad/runtime/autogen/amd/{arch}/__init__.py` - Generated by `python -m tinygrad.renderer.amd.dsl --arch {arch}`
- `tinygrad/runtime/autogen/amd/{arch}/gen_pcode.py` - Generated by `python -m tinygrad.renderer.amd.pcode --arch {arch}`
Where `{arch}` is one of: `rdna3`, `rdna4`, `cdna`
To add missing instruction implementations, add them to `extra/assembly/amd/emu.py` instead.
To add missing instruction implementations, add them to `tinygrad/renderer/amd/emu.py` instead.
## Style Notes

View file

@ -1,67 +0,0 @@
# Instruction format detection and decoding
from __future__ import annotations
from extra.assembly.amd.dsl import Inst, FixedBitField, EnumBitField
# SDWA/DPP variant detection: src0 field (bits 0-8) encodes the variant
# 0xf9 (249) = SDWA, 0xfa (250) = DPP16 for CDNA (GFX9)
# Keys are class-name suffixes (tested with str.endswith in _matches); values are the src0 value
# that the low 9 bits of the first dword must equal for a class with that suffix to match.
_VARIANT_SRC0 = {"_SDWA_SDST": 0xf9, "_SDWA": 0xf9, "_DPP16": 0xfa}
def _matches(data: bytes, cls: type[Inst]) -> bool:
  """Report whether the raw bytes in `data` are compatible with the encoding class `cls`.

  Each entry of `cls._fields` is examined in turn:
    * a FixedBitField must hold exactly its `default` value,
    * an EnumBitField with a non-None `allowed` collection must decode to a member of it.
  If `data` is too short to contain the dword a field starts in, the class does not match.
  Classes whose name ends with one of the `_VARIANT_SRC0` suffixes must additionally carry
  the corresponding src0 value in the low 9 bits of the first dword.
  """
  for _, bitfield in cls._fields:
    # byte offset of the 32-bit little-endian dword that contains the field's low bit
    start = (bitfield.lo // 32) * 4
    if len(data) < start + 4:
      return False
    shifted = int.from_bytes(data[start:start + 4], 'little') >> (bitfield.lo % 32)

    if isinstance(bitfield, FixedBitField) and (shifted & bitfield.mask) != bitfield.default:
      return False

    if isinstance(bitfield, EnumBitField) and bitfield.allowed is not None:
      try:
        opcode = bitfield.decode(shifted & bitfield.mask)
      except ValueError:
        # raw value is not a member of the enum at all
        return False
      if opcode not in bitfield.allowed:
        return False

  # SDWA/DPP variant classes are told apart by the src0 field (bits 0-8 of the first dword)
  cls_name = cls.__name__
  wanted_src0 = next((src0 for suffix, src0 in _VARIANT_SRC0.items() if cls_name.endswith(suffix)), None)
  if wanted_src0 is None:
    return True
  return (int.from_bytes(data[:4], 'little') & 0x1ff) == wanted_src0
# Import instruction classes for each architecture
from extra.assembly.amd.autogen.rdna3.ins import (VOP1, VOP1_SDST, VOP1_LIT, VOP2, VOP2_LIT, VOP3, VOP3_SDST, VOP3SD, VOP3P, VOPC, VOPD, VINTERP,
SOP1, SOP1_LIT, SOP2, SOP2_LIT, SOPC, SOPK, SOPK_LIT, SOPP, SMEM, DS, FLAT, GLOBAL, SCRATCH)
from extra.assembly.amd.autogen.rdna4.ins import (VOP1 as R4_VOP1, VOP1_SDST as R4_VOP1_SDST, VOP1_LIT as R4_VOP1_LIT,
VOP2 as R4_VOP2, VOP2_LIT as R4_VOP2_LIT, VOP3 as R4_VOP3, VOP3_SDST as R4_VOP3_SDST, VOP3SD as R4_VOP3SD, VOP3P as R4_VOP3P,
VOPC as R4_VOPC, VOPD as R4_VOPD, VINTERP as R4_VINTERP, SOP1 as R4_SOP1, SOP1_LIT as R4_SOP1_LIT,
SOP2 as R4_SOP2, SOP2_LIT as R4_SOP2_LIT, SOPC as R4_SOPC, SOPC_LIT as R4_SOPC_LIT,
SOPK as R4_SOPK, SOPK_LIT as R4_SOPK_LIT, SOPP as R4_SOPP,
SMEM as R4_SMEM, DS as R4_DS, VFLAT as R4_FLAT, VGLOBAL as R4_GLOBAL, VSCRATCH as R4_SCRATCH)
from extra.assembly.amd.autogen.cdna.ins import (VOP1 as C_VOP1, VOP1_SDWA as C_VOP1_SDWA, VOP1_DPP16 as C_VOP1_DPP16,
VOP2 as C_VOP2, VOP2_LIT as C_VOP2_LIT, VOP2_SDWA as C_VOP2_SDWA, VOP2_DPP16 as C_VOP2_DPP16,
VOPC as C_VOPC, VOPC_SDWA_SDST as C_VOPC_SDWA_SDST,
VOP3 as C_VOP3, VOP3_SDST as C_VOP3_SDST, VOP3SD as C_VOP3SD, VOP3P as C_VOP3P, VOP3P_MFMA as C_VOP3P_MFMA, VOP3PX2 as C_VOP3PX2,
SOP1 as C_SOP1, SOP2 as C_SOP2, SOPC as C_SOPC, SOPK as C_SOPK, SOPK_LIT as C_SOPK_LIT, SOPP as C_SOPP, SMEM as C_SMEM, DS as C_DS,
FLAT as C_FLAT, GLOBAL as C_GLOBAL, SCRATCH as C_SCRATCH, MUBUF as C_MUBUF)
# Per-architecture candidate lists, keyed by arch name ("rdna3", "rdna4", "cdna").
# detect_format walks the list for the requested arch and returns the first class that matches.
# Order matters: more specific encodings first, catch-alls (SOP2, VOP2) last
# Order: base before _LIT (base matches regular ops, _LIT catches lit-only ops excluded from base)
_FORMATS = {
"rdna3": [VOPD, VOP3P, VINTERP, VOP3SD, VOP3_SDST, VOP3, DS, GLOBAL, SCRATCH, FLAT, SMEM,
SOP1, SOP1_LIT, SOP2, SOP2_LIT, SOPC, SOPK, SOPK_LIT, SOPP, VOPC, VOP1_SDST, VOP1, VOP1_LIT, VOP2, VOP2_LIT],
"rdna4": [R4_VOPD, R4_VOP3P, R4_VINTERP, R4_VOP3SD, R4_VOP3_SDST, R4_VOP3, R4_DS, R4_GLOBAL, R4_SCRATCH, R4_FLAT, R4_SMEM,
R4_SOP1, R4_SOP1_LIT, R4_SOPC, R4_SOPC_LIT, R4_SOPP, R4_SOPK, R4_SOPK_LIT, R4_VOPC, R4_VOP1_SDST, R4_VOP1, R4_VOP1_LIT,
R4_SOP2, R4_SOP2_LIT, R4_VOP2, R4_VOP2_LIT],
"cdna": [C_VOP3PX2, C_VOP3P_MFMA, C_VOP3P, C_VOP3SD, C_VOP3_SDST, C_VOP3, C_DS, C_GLOBAL, C_SCRATCH, C_FLAT, C_MUBUF, C_SMEM,
C_SOP1, C_SOPC, C_SOPP, C_SOPK, C_SOPK_LIT, C_VOPC_SDWA_SDST, C_VOPC,
C_VOP1_DPP16, C_VOP1_SDWA, C_VOP1, C_VOP2_DPP16, C_VOP2_SDWA, C_SOP2, C_VOP2, C_VOP2_LIT],
}
def detect_format(data: bytes, arch: str = "rdna3") -> type[Inst]:
  """Pick the instruction class whose encoding matches the leading bytes of `data`.

  Args:
    data: machine code bytes; at least 4 bytes are required.
    arch: key into `_FORMATS` selecting the candidate list (defaults to "rdna3").

  Returns:
    The first class in `_FORMATS[arch]` for which `_matches` succeeds.

  Raises:
    ValueError: no candidate class matches; the message includes the first dword in hex.
  """
  assert len(data) >= 4, f"need at least 4 bytes, got {len(data)}"
  # candidates are ordered most-specific first, so the first hit wins
  found = next((candidate for candidate in _FORMATS[arch] if _matches(data, candidate)), None)
  if found is None:
    raise ValueError(f"unknown {arch} format word={int.from_bytes(data[:4], 'little'):#010x}")
  return found
def decode_inst(data: bytes, arch: str = "rdna3") -> Inst:
  """Turn machine code bytes into an instruction object.

  The encoding class is chosen by `detect_format(data, arch)`, and its `from_bytes`
  constructor is then applied to the same bytes.
  """
  inst_cls = detect_format(data, arch)
  return inst_cls.from_bytes(data)

View file

@ -14,8 +14,8 @@ from tinygrad import Tensor, Device, Context, GlobalCounters
from tinygrad.uop.ops import UOp, Ops, KernelInfo
from tinygrad.helpers import getenv, colored
from tinygrad.engine.realize import Estimates
from extra.assembly.amd.dsl import s, v, VCC_LO, NULL
from extra.assembly.amd.autogen.rdna3.ins import *
from tinygrad.renderer.amd.dsl import s, v, VCC_LO, NULL
from tinygrad.runtime.autogen.amd.rdna3.ins import *
# =============================================================================
# Kernel constants
@ -192,7 +192,7 @@ class Kernel:
inst.simm16 = offset_dwords
# TODO: replace this with direct ELF
from extra.assembly.amd.test.disasm import disasm
from test.amd.disasm import disasm
body = ['\t' + disasm(inst) for inst in self.instructions]
# limit wave occupancy by using more LDS

View file

@ -1,4 +1,4 @@
from extra.assembly.amd.autogen.cdna.ins import *
from tinygrad.runtime.autogen.amd.cdna.ins import *
from tinygrad.dtype import dtypes
# M0 is encoded with 124 (NULL in RDNA) in CDNA
@ -73,7 +73,7 @@ class Kernel:
lines, pos = [], 0
for inst in self.instructions:
if (label := self.label_at_pos.get(pos)) is not None: lines.append(f"{label}:")
from extra.assembly.amd.test.disasm import disasm
from test.amd.disasm import disasm
lines.append(f" {disasm(inst)}" if inst._target is None else f" {inst.op_name.lower()} {inst._target}")
pos += inst.size()
return "\n".join(lines)

View file

@ -5,7 +5,7 @@ os.environ["AMD_AQL"] = "1"
from tinygrad.device import Device
from tinygrad.runtime.support.compiler_amd import HIPCompiler
from extra.assembly.amd.dsl import Reg, Inst, s, v
from tinygrad.renderer.amd.dsl import Reg, Inst, s, v
NUM_WORKGROUPS = 96
WAVE_SIZE = 32
@ -51,7 +51,7 @@ if __name__=="__main__":
COMPILER = HIPCompiler(arch)
if arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
from extra.assembly.amd.autogen.rdna3.ins import *
from tinygrad.runtime.autogen.amd.rdna3.ins import *
if arch == 'gfx1103': NUM_WORKGROUPS = 8
if arch == 'gfx1151': NUM_WORKGROUPS = 32
launchBenchmark(v_wmma_bf16_16x16x16_bf16, (7,8,15))
@ -61,7 +61,7 @@ if __name__=="__main__":
launchBenchmark(v_wmma_i32_16x16x16_iu4, (7,8,9))
launchBenchmark(v_wmma_i32_16x16x16_iu8, (7,8,11))
elif arch in {'gfx1200', 'gfx1201'}:
from extra.assembly.amd.autogen.rdna4.ins import *
from tinygrad.runtime.autogen.amd.rdna4.ins import *
# this instruction does not exist in the rdna4 isa, use the co version
s_sub_u32 = s_sub_co_u32
NUM_WORKGROUPS = 64
@ -90,7 +90,7 @@ if __name__=="__main__":
FLOPS_PER_MATMUL = 16*16*64*2
launchBenchmark(v_swmmac_i32_16x16x64_iu4, (7,8,9,10,13,14), False)
elif arch == 'gfx950':
from extra.assembly.amd.autogen.cdna.ins import *
from tinygrad.runtime.autogen.amd.cdna.ins import *
DIRECTIVE = ".amdhsa_accum_offset 4"
NUM_WORKGROUPS = 256
WAVE_SIZE = 64

View file

@ -3,7 +3,7 @@ from __future__ import annotations
import enum, collections
from typing import Iterator
from tinygrad.helpers import colored
from extra.assembly.amd.sqtt import PacketType, bits
from tinygrad.renderer.amd.sqtt import PacketType, bits
# ═══════════════════════════════════════════════════════════════════════════════
# STALL REASONS

View file

@ -7,8 +7,8 @@ import subprocess, struct, math, functools
from tinygrad import Tensor, dtypes, Device
from tinygrad.helpers import getenv
from extra.assembly.amd.autogen.rdna3.ins import *
from extra.assembly.amd.asm import waitcnt
from tinygrad.runtime.autogen.amd.rdna3.ins import *
from tinygrad.renderer.amd.asm import waitcnt
from test.testextra.test_cfg_viz import asm_kernel

View file

@ -27,9 +27,14 @@ packages = [
'tinygrad.mixin',
'tinygrad.nn',
'tinygrad.renderer',
'tinygrad.renderer.amd',
'tinygrad.runtime',
'tinygrad.runtime.autogen',
'tinygrad.runtime.autogen.am',
'tinygrad.runtime.autogen.amd',
'tinygrad.runtime.autogen.amd.rdna3',
'tinygrad.runtime.autogen.amd.rdna4',
'tinygrad.runtime.autogen.amd.cdna',
'tinygrad.runtime.graph',
'tinygrad.runtime.support',
'tinygrad.runtime.support.am',
@ -128,7 +133,7 @@ debug = true
[tool.mypy]
warn_unused_configs = true
files = ["tinygrad", "extra/assembly/amd"]
files = ["tinygrad"]
ignore_missing_imports = true
check_untyped_defs = true
explicit_package_bases = true
@ -142,9 +147,7 @@ strict_equality = true
module = "extra.*"
follow_imports = "skip"
[[tool.mypy.overrides]]
module = "extra.assembly.amd.*"
follow_imports = "normal"
[tool.pytest.ini_options]
norecursedirs = [
@ -184,7 +187,6 @@ exclude = [
".git/",
"docs/",
"extra/",
"!extra/assembly/amd/",
"test/external/mlperf_resnet",
"test/external/mlperf_unet3d",
]
@ -250,8 +252,8 @@ select = [
"F841",
]
"tinygrad/runtime/autogen/**/*.py" = ["E501", "F401", "E722", "E731", "F821", "A006", "A002", "F811"]
"extra/assembly/amd/autogen/**/*.py" = ["E501"]
"extra/assembly/amd/test/**/*.py" = ["F403", "F405"]
"tinygrad/runtime/autogen/amd/**/*.py" = ["E501"]
"test/amd/**/*.py" = ["F403", "F405"]
[tool.ruff.format]
exclude = ["*"]

View file

@ -6,13 +6,15 @@ from pathlib import Path
# Set AMD=1 before importing tinygrad
os.environ["AMD"] = "1"
from extra.assembly.amd.emu import run_asm as python_run_asm, decode_program
from extra.assembly.amd import decode_inst
from extra.assembly.amd.autogen.rdna3.ins import SOPP, SOPPOp
from tinygrad.renderer.amd.emu import run_asm as python_run_asm, decode_program
from tinygrad.renderer.amd import decode_inst
from tinygrad.runtime.autogen.amd.rdna3.ins import SOPP, SOPPOp
REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.so"
import tinygrad
EXTRA_DIR = Path(tinygrad.__file__).parent.parent / "extra"
REMU_PATH = EXTRA_DIR / "remu/target/release/libremu.so"
if not REMU_PATH.exists():
REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.dylib"
REMU_PATH = EXTRA_DIR / "remu/target/release/libremu.dylib"
def get_rust_remu():
"""Load the Rust libremu shared library."""
@ -68,7 +70,7 @@ def benchmark_emulator(name: str, run_fn, kernel: bytes, global_size, local_size
def profile_instructions(kernel: bytes):
"""Profile individual instruction compile times."""
from extra.assembly.amd.emu import _get_runner, _canonical_runner_cache
from tinygrad.renderer.amd.emu import _get_runner, _canonical_runner_cache
from tinygrad.helpers import Context
_get_runner.cache_clear()
_canonical_runner_cache.clear()
@ -98,7 +100,7 @@ def profile_instructions(kernel: bytes):
def benchmark_python_split(kernel: bytes, global_size, local_size, args_ptr, rsrc2: int, iterations: int = 5):
"""Benchmark Python emulator with compile and execution times."""
from extra.assembly.amd.emu import _get_runner, _canonical_runner_cache
from tinygrad.renderer.amd.emu import _get_runner, _canonical_runner_cache
from tinygrad.helpers import Context
_get_runner.cache_clear()
_canonical_runner_cache.clear()

View file

@ -2,7 +2,7 @@
from __future__ import annotations
import re
from typing import Callable
from extra.assembly.amd.dsl import Inst, Reg
from tinygrad.renderer.amd.dsl import Inst, Reg
# Special register mappings for disassembly
SPECIAL_GPRS = {106: 'vcc_lo', 107: 'vcc_hi', 124: 'null', 125: 'm0', 126: 'exec_lo', 127: 'exec_hi',
@ -81,17 +81,17 @@ def _num_srcs(inst) -> int:
# IMPORTS
# ═══════════════════════════════════════════════════════════════════════════════
from extra.assembly.amd.autogen.rdna3.ins import (VOP1, VOP1_SDST, VOP1_SDST_LIT, VOP1_LIT, VOP2, VOP2_LIT, VOP3, VOP3_SDST, VOP3_SDST_LIT,
from tinygrad.runtime.autogen.amd.rdna3.ins import (VOP1, VOP1_SDST, VOP1_SDST_LIT, VOP1_LIT, VOP2, VOP2_LIT, VOP3, VOP3_SDST, VOP3_SDST_LIT,
VOP3_LIT, VOP3SD, VOP3SD_LIT, VOP3P, VOP3P_LIT, VOPC, VOPC_LIT, VOPD, VOPD_LIT, VINTERP, SOP1, SOP1_LIT, SOP2, SOP2_LIT, SOPC, SOPC_LIT,
SOPK, SOPK_LIT, SOPP, SMEM, DS, FLAT, GLOBAL, SCRATCH, VOP2Op, VOPDOp, SOPPOp, HWREG, MSG)
from extra.assembly.amd.autogen.rdna4.ins import (VOP1 as R4_VOP1, VOP1_SDST as R4_VOP1_SDST,
from tinygrad.runtime.autogen.amd.rdna4.ins import (VOP1 as R4_VOP1, VOP1_SDST as R4_VOP1_SDST,
VOP1_SDST_LIT as R4_VOP1_SDST_LIT, VOP1_LIT as R4_VOP1_LIT,
VOP2 as R4_VOP2, VOP2_LIT as R4_VOP2_LIT, VOP3 as R4_VOP3, VOP3_SDST as R4_VOP3_SDST, VOP3_SDST_LIT as R4_VOP3_SDST_LIT, VOP3_LIT as R4_VOP3_LIT,
VOP3SD as R4_VOP3SD, VOP3SD_LIT as R4_VOP3SD_LIT, VOP3P as R4_VOP3P, VOP3P_LIT as R4_VOP3P_LIT, VOPC as R4_VOPC, VOPC_LIT as R4_VOPC_LIT,
VOPD as R4_VOPD, VOPD_LIT as R4_VOPD_LIT, VINTERP as R4_VINTERP, SOP1 as R4_SOP1, SOP1_LIT as R4_SOP1_LIT, SOP2 as R4_SOP2, SOP2_LIT as R4_SOP2_LIT,
SOPC as R4_SOPC, SOPC_LIT as R4_SOPC_LIT, SOPK as R4_SOPK, SOPK_LIT as R4_SOPK_LIT, SOPP as R4_SOPP, SMEM as R4_SMEM, DS as R4_DS,
VOPDOp as R4_VOPDOp, HWREG as HWREG_RDNA4, VFLAT as R4_FLAT, VGLOBAL as R4_GLOBAL, VSCRATCH as R4_SCRATCH)
from extra.assembly.amd.autogen.cdna.ins import HWREG as HWREG_CDNA
from tinygrad.runtime.autogen.amd.cdna.ins import HWREG as HWREG_CDNA
def _is_cdna(inst: Inst) -> bool: return 'cdna' in inst.__class__.__module__
def _is_r4(inst: Inst) -> bool: return 'rdna4' in inst.__class__.__module__
@ -631,7 +631,7 @@ def disasm(inst: Inst) -> str: return DISASM_HANDLERS[type(inst)](inst)
# CDNA DISASSEMBLER SUPPORT
# ═══════════════════════════════════════════════════════════════════════════════
from extra.assembly.amd.autogen.cdna.ins import (VOP1 as CDNA_VOP1, VOP1_LIT as CDNA_VOP1_LIT,
from tinygrad.runtime.autogen.amd.cdna.ins import (VOP1 as CDNA_VOP1, VOP1_LIT as CDNA_VOP1_LIT,
VOP1_SDWA as CDNA_VOP1_SDWA, VOP1_DPP16 as CDNA_VOP1_DPP16,
VOP2 as CDNA_VOP2, VOP2_LIT as CDNA_VOP2_LIT, VOP2_SDWA as CDNA_VOP2_SDWA, VOP2_DPP16 as CDNA_VOP2_DPP16,
VOPC as CDNA_VOPC, VOPC_LIT as CDNA_VOPC_LIT, VOPC_SDWA_SDST as CDNA_VOPC_SDWA_SDST,

View file

@ -4,10 +4,10 @@ Uses run_asm() with memory output, so tests can run on both emulator and real ha
Set USE_HW=1 to run on both emulator and hardware, comparing results.
"""
import ctypes, math, os, struct
from extra.assembly.amd.autogen.rdna3.ins import *
from tinygrad.runtime.autogen.amd.rdna3.ins import *
from extra.assembly.amd.emu import run_asm
from extra.assembly.amd.dsl import NULL, SCC, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, M0
from tinygrad.renderer.amd.emu import run_asm
from tinygrad.renderer.amd.dsl import NULL, SCC, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, M0
def _i32(f: float) -> int: return struct.unpack('<I', struct.pack('<f', f))[0]
def _f32(i: int) -> float: return struct.unpack('<f', struct.pack('<I', i & 0xFFFFFFFF))[0]

View file

@ -5,7 +5,7 @@ Includes: ds_store_b32, ds_load_b32, ds_store_2addr_*, ds_load_2addr_*,
ds_inc_*, ds_dec_*, ds_cmpstore_*, ds_storexchg_*
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
class TestDS2Addr(unittest.TestCase):
"""Tests for DS_*_2ADDR instructions."""

View file

@ -3,7 +3,7 @@
Includes: flat_load_*, flat_store_*, flat_atomic_*
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
class TestFlatAtomic(unittest.TestCase):
"""Tests for FLAT atomic instructions."""

View file

@ -3,7 +3,7 @@
Includes: global_load_*, global_store_*, global_atomic_*, global_load_d16_*
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
class TestGlobalAtomic(unittest.TestCase):
"""Tests for GLOBAL atomic instructions."""

View file

@ -3,7 +3,7 @@
Includes: scratch_load_*, scratch_store_*
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
class TestScratchStore(unittest.TestCase):
"""Tests for SCRATCH store instructions."""

View file

@ -4,7 +4,7 @@ Includes: s_load_b32, s_load_b64, s_load_b128, s_load_b256, s_load_b512
Tests both immediate and register offset addressing modes.
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
# Use offset into output buffer for test data (output buffer is 2124 bytes)
TEST_OFFSET = 2000

View file

@ -4,7 +4,7 @@ Includes: s_add_u32, s_mov_b32, s_and_b32, s_or_b32, s_quadmask_b32, s_wqm_b32,
s_cbranch_vccnz, s_cbranch_vccz
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
class TestBasicScalar(unittest.TestCase):
"""Tests for basic scalar operations."""

View file

@ -5,7 +5,7 @@ Includes: v_mov_b32, v_cvt_*, v_sin_f32, v_rcp_f32, v_exp_f32, v_rndne_f32,
v_readfirstlane_b32
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
class TestMov(unittest.TestCase):
"""Tests for V_MOV_B32."""

View file

@ -5,7 +5,7 @@ Includes: v_add_f32, v_mul_f32, v_and_b32, v_or_b32, v_xor_b32,
v_add_nc_u32, v_cndmask_b32, v_add_f16, v_mul_f16
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
class TestBasicArithmetic(unittest.TestCase):
"""Tests for basic arithmetic VOP2 instructions."""

View file

@ -4,7 +4,7 @@ Includes: v_fma_f32, v_div_scale_f32, v_div_fmas_f32, v_div_fixup_f32,
v_alignbit_b32, v_bfe_i32, v_mad_u64_u32, v_readlane_b32, v_writelane_b32
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
class TestFMA(unittest.TestCase):
"""Tests for FMA instructions."""
@ -2754,7 +2754,7 @@ class TestVOP3VOPC(unittest.TestCase):
def test_v_cmp_ge_f32_e64_nan(self):
"""V_CMP_GE_F32_E64: |NaN| >= |0.0| should be FALSE (NaN comparisons always false)."""
from extra.assembly.amd.autogen.rdna3.ins import VOP3_SDST
from tinygrad.runtime.autogen.amd.rdna3.ins import VOP3_SDST
instructions = [
s_mov_b32(s[0], 0xffc00000), # NaN
s_mov_b32(s[1], 0x00000000), # 0.0

View file

@ -3,7 +3,7 @@
Includes: v_pk_add_f16, v_pk_mul_f16, v_pk_fma_f16, v_pack_b32_f16, v_wmma_*, v_dot2_*
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
class TestPackInstructions(unittest.TestCase):
"""Tests for pack instructions."""

View file

@ -3,7 +3,7 @@
Includes: v_cmp_class_f32, v_cmp_class_f16, v_cmp_eq_*, v_cmp_lt_*, v_cmp_gt_*
"""
import unittest
from extra.assembly.amd.test.hw.helpers import *
from test.amd.hw.helpers import *
VCC = 106 # SGPR index for VCC_LO

View file

@ -7,8 +7,8 @@ VOPD executes two operations simultaneously. Key behavior:
- Op Y can use ops 0-18 (includes ADD_NC_U32, LSHLREV, AND)
"""
import unittest
from extra.assembly.amd.test.hw.helpers import run_program, v, v_mov_b32_e32
from extra.assembly.amd.autogen.rdna3.ins import VOPD, VOPD_LIT, VOPDOp
from test.amd.hw.helpers import run_program, v, v_mov_b32_e32
from tinygrad.runtime.autogen.amd.rdna3.ins import VOPD, VOPD_LIT, VOPDOp
class TestVOPDBasic(unittest.TestCase):
"""Basic VOPD functionality tests."""
@ -108,7 +108,7 @@ class TestVOPDLiterals(unittest.TestCase):
Tests that the 32-bit literal (SIMM32) is correctly passed to the instruction.
fma(2.0, 3.0, 10.0) = 2*3 + 10 = 16.0
"""
from extra.assembly.amd.test.hw.helpers import f2i, i2f
from test.amd.hw.helpers import f2i, i2f
instructions = [
v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0
v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0
@ -126,7 +126,7 @@ class TestVOPDLiterals(unittest.TestCase):
Tests that the 32-bit literal (SIMM32) is correctly used as the multiplier.
fma(2.0, 5.0, 3.0) = 2*5 + 3 = 13.0
"""
from extra.assembly.amd.test.hw.helpers import f2i, i2f
from test.amd.hw.helpers import f2i, i2f
instructions = [
v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0
v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0

View file

@ -2,10 +2,10 @@
import unittest, ctypes
from dataclasses import dataclass
from extra.assembly.amd.emu import WaveState, decode_program, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
from extra.assembly.amd import decode_inst
from extra.assembly.amd.test.helpers import KernelInfo
from extra.assembly.amd.test.bench_emu import REMU_PATH
from tinygrad.renderer.amd.emu import WaveState, decode_program, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
from tinygrad.renderer.amd import decode_inst
from test.amd.helpers import KernelInfo
from test.amd.bench_emu import REMU_PATH
def set_valid_mem_ranges(ranges): pass # emu2 doesn't need this

View file

@ -5,9 +5,9 @@ from tinygrad.uop.ops import UOp, Ops, KernelInfo
from tinygrad.renderer import Estimates
from tinygrad.runtime.support.compiler_amd import HIPCompiler
from extra.assembly.amd.autogen.rdna3.ins import *
from extra.assembly.amd.dsl import s, v, Inst
from extra.assembly.amd.test.disasm import disasm as disasm_inst
from tinygrad.runtime.autogen.amd.rdna3.ins import *
from tinygrad.renderer.amd.dsl import s, v, Inst
from test.amd.disasm import disasm as disasm_inst
def assemble_insts(insts:list[Inst], name:str, arch:str, kernarg_size:int=8) -> tuple[UOp, UOp]:
kd = {"kernarg_size":kernarg_size, "user_sgpr_kernarg_segment_ptr":1, "next_free_vgpr":8, "next_free_sgpr":8, "wavefront_size32":1}

View file

@ -1,8 +1,8 @@
import unittest
from extra.assembly.amd.dsl import *
from extra.assembly.amd.dsl import VDSTYField
from extra.assembly.amd.autogen.rdna3.enum import VOP1Op, VOP2Op
from extra.assembly.amd.autogen.rdna3.ins import VOP1
from tinygrad.renderer.amd.dsl import *
from tinygrad.renderer.amd.dsl import VDSTYField
from tinygrad.runtime.autogen.amd.rdna3.enum import VOP1Op, VOP2Op
from tinygrad.runtime.autogen.amd.rdna3.ins import VOP1
class TestRegisters(unittest.TestCase):
def test_vgpr_single(self):

View file

@ -4,10 +4,10 @@ from collections import defaultdict
from tinygrad.helpers import DEBUG
from tinygrad.dtype import dtypes
from tinygrad.uop.ops import UOp, Ops
from extra.assembly.amd.emu import parse_pcode
from extra.assembly.amd.pcode import parse_expr
from extra.assembly.amd.autogen.rdna3.str_pcode import PCODE
from extra.assembly.amd.autogen.rdna3.enum import VOP1Op, VOP2Op, SOP2Op, DSOp
from tinygrad.renderer.amd.emu import parse_pcode
from tinygrad.renderer.amd.pcode import parse_expr
from tinygrad.runtime.autogen.amd.rdna3.str_pcode import PCODE
from tinygrad.runtime.autogen.amd.rdna3.enum import VOP1Op, VOP2Op, SOP2Op, DSOp
def _srcs():
"""Create minimal source variables for pcode parsing."""
@ -325,15 +325,15 @@ class TestAllPcode(unittest.TestCase):
self.assertGreaterEqual(pct, min_pct, f"[{arch}] {pct:.1f}% < {min_pct}% threshold")
def test_parse_all_cdna_pcode(self):
from extra.assembly.amd.autogen.cdna.str_pcode import PCODE as CDNA_PCODE
from tinygrad.runtime.autogen.amd.cdna.str_pcode import PCODE as CDNA_PCODE
self._parse_all_pcode(CDNA_PCODE, "CDNA", min_pct=60)
def test_parse_all_rdna3_pcode(self):
from extra.assembly.amd.autogen.rdna3.str_pcode import PCODE as RDNA3_PCODE
from tinygrad.runtime.autogen.amd.rdna3.str_pcode import PCODE as RDNA3_PCODE
self._parse_all_pcode(RDNA3_PCODE, "RDNA3", min_pct=90)
def test_parse_all_rdna4_pcode(self):
from extra.assembly.amd.autogen.rdna4.str_pcode import PCODE as RDNA4_PCODE
from tinygrad.runtime.autogen.amd.rdna4.str_pcode import PCODE as RDNA4_PCODE
self._parse_all_pcode(RDNA4_PCODE, "RDNA4", min_pct=65)
if __name__ == "__main__":

View file

@ -4,10 +4,10 @@
Note: Graphics-only formats (EXP, MUBUF, MTBUF, MIMG) are not supported - use GLOBAL/FLAT for memory access in compute.
"""
import unittest
from extra.assembly.amd.autogen.rdna3.ins import *
from extra.assembly.amd.dsl import VCC_HI, EXEC_LO, NULL
from tinygrad.runtime.autogen.amd.rdna3.ins import *
from tinygrad.renderer.amd.dsl import VCC_HI, EXEC_LO, NULL
OFF = NULL # OFF is alias for NULL
from extra.assembly.amd import detect_format
from tinygrad.renderer.amd import detect_format
class TestDS(unittest.TestCase):

View file

@ -2,10 +2,10 @@
# the Inst constructor should be looking at the types of the fields to correctly set the value
import unittest, struct
from extra.assembly.amd.autogen.rdna3.ins import *
from extra.assembly.amd.dsl import Inst
from extra.assembly.amd.test.test_roundtrip import compile_asm
from extra.assembly.amd.test.disasm import disasm
from tinygrad.runtime.autogen.amd.rdna3.ins import *
from tinygrad.renderer.amd.dsl import Inst
from test.amd.test_roundtrip import compile_asm
from test.amd.disasm import disasm
class IntegrationTestBase(unittest.TestCase):
inst: Inst
@ -133,11 +133,11 @@ class TestIntegrationCDNA(IntegrationTestBase):
arch = "cdna"
def test_mfma(self):
from extra.assembly.amd.autogen.cdna.ins import v_mfma_f32_16x16x16_f16
from tinygrad.runtime.autogen.amd.cdna.ins import v_mfma_f32_16x16x16_f16
self.inst = v_mfma_f32_16x16x16_f16(v[0:3], v[0:1], v[0:1], 0)
def test_mfma_fp8(self):
from extra.assembly.amd.autogen.cdna.ins import v_mfma_f32_16x16x128_f8f6f4
from tinygrad.runtime.autogen.amd.cdna.ins import v_mfma_f32_16x16x128_f8f6f4
self.inst = v_mfma_f32_16x16x128_f8f6f4(v[0:3], v[0:5], v[0:5], 1, cbsz=2, blgp=2)
class TestRegisterSliceSyntax(unittest.TestCase):

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""Integration test: round-trip RDNA3 assembly through AMD toolchain."""
import unittest, io, sys
from extra.assembly.amd.autogen.rdna3.ins import *
from tinygrad.runtime.autogen.amd.rdna3.ins import *
def waitcnt(vmcnt: int = 0x3f, expcnt: int = 0x7, lgkmcnt: int = 0x3f) -> int:
return (expcnt & 0x7) | ((lgkmcnt & 0x3f) << 4) | ((vmcnt & 0x3f) << 10)

View file

@ -10,9 +10,9 @@ Only compute-relevant instruction formats are tested. Graphics-only formats not
"""
import unittest, re, subprocess, functools
from tinygrad.helpers import fetch
from extra.assembly.amd.test.disasm import disasm
from extra.assembly.amd import decode_inst, detect_format
from extra.assembly.amd.test.helpers import get_llvm_mc, get_target, get_mattr
from test.amd.disasm import disasm
from tinygrad.renderer.amd import decode_inst, detect_format
from test.amd.helpers import get_llvm_mc, get_target, get_mattr
LLVM_BASE = "https://raw.githubusercontent.com/llvm/llvm-project/llvmorg-21.1.0/llvm/test/MC/AMDGPU"
@ -127,9 +127,9 @@ def _make_test(f: str, arch: str, test_type: str):
self.assertEqual(skipped, 0, f"{name}: {skipped} tests skipped, expected 0")
elif test_type == "repr":
# Test that eval(repr(inst)) reproduces the instruction
if arch == "rdna3": import extra.assembly.amd.autogen.rdna3.ins as ins # type: ignore[no-redef]
elif arch == "rdna4": import extra.assembly.amd.autogen.rdna4.ins as ins # type: ignore[no-redef]
elif arch == "cdna": import extra.assembly.amd.autogen.cdna.ins as ins # type: ignore[no-redef]
if arch == "rdna3": import tinygrad.runtime.autogen.amd.rdna3.ins as ins # type: ignore[no-redef]
elif arch == "rdna4": import tinygrad.runtime.autogen.amd.rdna4.ins as ins # type: ignore[no-redef]
elif arch == "cdna": import tinygrad.runtime.autogen.amd.cdna.ins as ins # type: ignore[no-redef]
ns = {k: getattr(ins, k) for k in dir(ins) if not k.startswith('_')}
passed, skipped = 0, 0
for _, data in tests:

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""Test PDF pseudocode extraction from generate.py."""
import unittest
from extra.assembly.amd.generate import extract_pdf_text, extract_pcode, parse_xml, ARCHS, FIXES
from tinygrad.renderer.amd.generate import extract_pdf_text, extract_pcode, parse_xml, ARCHS, FIXES
EXPECTED_PAGES = {"rdna3": 655, "rdna4": 711, "cdna": 610}

View file

@ -1,8 +1,8 @@
#!/usr/bin/env python3
import unittest, subprocess
from extra.assembly.amd.autogen.rdna3.ins import *
from extra.assembly.amd.test.helpers import get_llvm_mc
from extra.assembly.amd.test.disasm import disasm
from tinygrad.runtime.autogen.amd.rdna3.ins import *
from test.amd.helpers import get_llvm_mc
from test.amd.disasm import disasm
def llvm_assemble(asm: str) -> bytes:
"""Assemble using llvm-mc and return bytes."""

View file

@ -1,7 +1,7 @@
import unittest, ctypes
from extra.assembly.amd.autogen.rdna4 import ins as ir4
from extra.assembly.amd.dsl import v, s
from extra.assembly.amd.emu import WaveState, decode_program
from tinygrad.runtime.autogen.amd.rdna4 import ins as ir4
from tinygrad.renderer.amd.dsl import v, s
from tinygrad.renderer.amd.emu import WaveState, decode_program
from tinygrad.device import Buffer, BufferSpec
from tinygrad.dtype import dtypes

View file

@ -1,9 +1,9 @@
#!/usr/bin/env python3
"""Roundtrip tests: generate tinygrad kernels, decode instructions, re-encode, verify match."""
import unittest, io, sys, re, subprocess, os
from extra.assembly.amd import detect_format
from extra.assembly.amd.test.helpers import get_llvm_mc, get_llvm_objdump, get_target, get_mattr
from extra.assembly.amd.test.disasm import disasm
from tinygrad.renderer.amd import detect_format
from test.amd.helpers import get_llvm_mc, get_llvm_objdump, get_target, get_mattr
from test.amd.disasm import disasm
def disassemble_lib(lib: bytes, compiler) -> list[tuple[str, bytes]]:
"""Disassemble ELF binary and return list of (instruction_text, machine_code_bytes)."""
@ -82,7 +82,7 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
"""
arch = self.arch
from extra.assembly.amd.test.test_compare_emulators import get_kernels_from_tinygrad
from test.amd.test_compare_emulators import get_kernels_from_tinygrad
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
from tinygrad.helpers import AMD_LLVM

View file

@ -5,15 +5,16 @@ from pathlib import Path
from tinygrad.helpers import DEBUG
from tinygrad.runtime.autogen import rocprof
from tinygrad.runtime.support.elf import elf_loader
from extra.assembly.amd import decode_inst
from extra.assembly.amd.autogen.rdna3.ins import SOPP
from extra.assembly.amd.autogen.rdna3.enum import SOPPOp
from extra.assembly.amd.sqtt import (decode, LAYOUT_HEADER, WAVESTART, WAVESTART_RDNA4, WAVEEND, INST, INST_RDNA4, VALUINST,
from tinygrad.renderer.amd import decode_inst
from tinygrad.runtime.autogen.amd.rdna3.ins import SOPP
from tinygrad.runtime.autogen.amd.rdna3.enum import SOPPOp
from tinygrad.renderer.amd.sqtt import (decode, LAYOUT_HEADER, WAVESTART, WAVESTART_RDNA4, WAVEEND, INST, INST_RDNA4, VALUINST,
IMMEDIATE, IMMEDIATE_MASK, PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4,
InstOp, InstOpRDNA4, print_packets)
from extra.assembly.amd.test.helpers import TARGET_TO_ARCH
from test.amd.helpers import TARGET_TO_ARCH
EXAMPLES_DIR = Path(__file__).parent.parent.parent.parent / "sqtt/examples"
import tinygrad
EXAMPLES_DIR = Path(tinygrad.__file__).parent.parent / "extra/sqtt/examples"
# INST ops for non-traced SIMDs (excluded from instruction count)
OTHER_SIMD_OPS = {InstOp.OTHER_LDS_LOAD, InstOp.OTHER_LDS_STORE, InstOp.OTHER_LDS_STORE_64, InstOp.OTHER_LDS_STORE_128,
InstOp.OTHER_FLAT_LOAD, InstOp.OTHER_FLAT_STORE, InstOp.OTHER_FLAT_STORE_64, InstOp.OTHER_FLAT_STORE_96,

View file

@ -3,7 +3,8 @@ import unittest, struct, ctypes, pickle
from pathlib import Path
ROCPROF_LIB = Path("/usr/lib/librocprof-trace-decoder.so")
EXAMPLES_DIR = Path(__file__).parent.parent.parent.parent / "sqtt/examples"
import tinygrad
EXAMPLES_DIR = Path(tinygrad.__file__).parent.parent / "extra/sqtt/examples"
# CDNA pkt_fmt -> size in bytes (extracted from rocprof hash table)
CDNA_PKT_SIZES = {0: 2, 1: 8, 2: 8, 3: 4, 4: 2, 5: 6, 6: 2, 7: 2, 8: 2, 9: 2, 10: 2, 11: 8, 12: 6, 13: 4, 14: 8, 15: 6}
@ -79,7 +80,7 @@ def extract_packet_encodings():
def extract_cdna_packet_sizes():
"""Extract CDNA pkt_fmt -> size mapping by running rocprof decoder to populate its hash table."""
from extra.assembly.amd.test.test_sqtt_examples import run_rocprof_decoder
from test.amd.test_sqtt_examples import run_rocprof_decoder
if not (pkl_path := next((EXAMPLES_DIR / "gfx950").glob("*.pkl"), None)): return None
with open(pkl_path, "rb") as f: data = pickle.load(f)
@ -124,7 +125,7 @@ class TestSQTTMatchesBinary(unittest.TestCase):
with self.subTest(pkt_fmt=pkt_fmt): self.assertEqual(pkt_sizes.get(pkt_fmt), size)
def test_cdna_packet_definitions(self):
from extra.assembly.amd.sqtt import PACKET_TYPES_CDNA
from tinygrad.renderer.amd.sqtt import PACKET_TYPES_CDNA
for pkt_fmt, pkt_cls in PACKET_TYPES_CDNA.items():
with self.subTest(packet=pkt_cls.__name__):
self.assertEqual(pkt_cls.encoding.default, pkt_fmt)
@ -132,21 +133,21 @@ class TestSQTTMatchesBinary(unittest.TestCase):
def _test_bit_counts(self, layout: int):
if not (tables := extract_bit_tables()): self.skipTest("rocprof-trace-decoder not installed")
from extra.assembly.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
from tinygrad.renderer.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
for type_id, pkt_cls in {3: PACKET_TYPES_RDNA3, 4: PACKET_TYPES_RDNA4}[layout].items():
with self.subTest(packet=pkt_cls.__name__):
self.assertEqual(pkt_cls._size_nibbles * 4, tables[layout - 2][type_id]) # type: ignore[attr-defined]
def _test_encodings(self, layout: int):
if not (encodings := extract_packet_encodings()): self.skipTest("rocprof-trace-decoder not installed")
from extra.assembly.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
from tinygrad.renderer.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
for type_id, pkt_cls in {3: PACKET_TYPES_RDNA3, 4: PACKET_TYPES_RDNA4}[layout].items():
with self.subTest(packet=pkt_cls.__name__):
self.assertEqual((pkt_cls.encoding.mask, pkt_cls.encoding.default), encodings[layout - 2][type_id])
def _test_delta_fields(self, layout: int):
if not (deltas := extract_delta_fields()): self.skipTest("rocprof-trace-decoder not installed")
from extra.assembly.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
from tinygrad.renderer.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
for type_id, pkt_cls in {3: PACKET_TYPES_RDNA3, 4: PACKET_TYPES_RDNA4}[layout].items():
if type_id not in deltas[layout - 2]: continue
delta = getattr(pkt_cls, 'delta', None)

View file

@ -3,11 +3,12 @@ import unittest, pickle
from typing import Iterator
from pathlib import Path
from tinygrad.helpers import DEBUG
from extra.assembly.amd.sqtt import print_packets, map_insts
from extra.assembly.amd.autogen.rdna3.ins import s_endpgm
from extra.assembly.amd.test.disasm import disasm
from tinygrad.renderer.amd.sqtt import print_packets, map_insts
from tinygrad.runtime.autogen.amd.rdna3.ins import s_endpgm
from test.amd.disasm import disasm
EXAMPLES_DIR = Path(__file__).parent.parent.parent.parent / "sqtt/examples"
import tinygrad
EXAMPLES_DIR = Path(tinygrad.__file__).parent.parent / "extra/sqtt/examples"
def rocprof_inst_traces_match(sqtt, prg, target):
from tinygrad.viz.serve import amd_decode

View file

@ -1,11 +1,11 @@
# ruff: noqa: F405
import unittest, subprocess, os
from extra.assembly.amd.autogen.rdna3.ins import * # noqa: F403
from extra.assembly.amd.dsl import s, v, Inst, NULL
from tinygrad.runtime.autogen.amd.rdna3.ins import * # noqa: F403
from tinygrad.renderer.amd.dsl import s, v, Inst, NULL
def assemble_kernel(insts:list[Inst], name:str="test") -> str:
kd = {"next_free_vgpr": 8, "next_free_sgpr": 8, "wavefront_size32": 1, "user_sgpr_kernarg_segment_ptr": 1, "kernarg_size": 8}
from extra.assembly.amd.test.disasm import disasm as _disasm
from test.amd.disasm import disasm as _disasm
disasm = "\n".join(_disasm(inst) for inst in insts)
hsasrc = f".text\n.globl {name}\n.p2align 8\n.type {name},@function\n{name}:\n{disasm}\n"
return hsasrc + f".rodata\n.p2align 6\n.amdhsa_kernel {name}\n" + "\n".join(f".amdhsa_{k} {v}" for k, v in kd.items()) + "\n.end_amdhsa_kernel"

View file

@ -6,8 +6,8 @@ Run with: AMD=1 python -m pytest test/external/external_test_gpu_crash.py -v
"""
import unittest, re
from tinygrad.device import Device
from extra.assembly.amd.autogen.rdna3.ins import * # noqa: F403
from extra.assembly.amd.dsl import s, v, Inst, NULL
from tinygrad.runtime.autogen.amd.rdna3.ins import * # noqa: F403
from tinygrad.renderer.amd.dsl import s, v, Inst, NULL
def assemble(code:str, name:str="test") -> str:
kd = {"next_free_vgpr": 8, "next_free_sgpr": 8, "wavefront_size32": 1, "user_sgpr_kernarg_segment_ptr": 1, "kernarg_size": 8}
@ -37,7 +37,7 @@ class TestGPUCrash(unittest.TestCase):
prg(self.dev.allocator.alloc(64), global_size=(1,1,1), local_size=(1,1,1), wait=True)
def _run_insts(self, insts: list[Inst]):
from extra.assembly.amd.test.disasm import disasm
from test.amd.disasm import disasm
self._run("\n".join(disasm(i) for i in insts))
def _assert_gpu_fault(self, func):

View file

@ -23,7 +23,7 @@ class PythonRemu:
arch: str = "rdna3" # Architecture: rdna3 or rdna4
def run_asm(self, lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int:
from extra.assembly.amd.emu import run_asm
from tinygrad.renderer.amd.emu import run_asm
return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr, self.rsrc2, self.scratch_size, self.arch)
def _try_dlopen_remu():

View file

@ -10,8 +10,8 @@ from tinygrad.device import Compiler
from tinygrad.runtime.support.compiler_amd import HIPCompiler
from tinygrad.viz.serve import amdgpu_cfg
from extra.assembly.amd.autogen.rdna3.ins import *
from extra.assembly.amd.dsl import s
from tinygrad.runtime.autogen.amd.rdna3.ins import *
from tinygrad.renderer.amd.dsl import s
# TODO: this belongs to the dsl infrastructure
from extra.gemm.amd_asm_matmul import Kernel

View file

@ -1,7 +1,7 @@
An integrated environment for AMD GPU assembly and emulation
Test with `PYTHONPATH="." pytest -n12 extra/assembly/amd/`
`AMD_LLVM=1 PYTHONPATH="." pytest -n12 extra/assembly/amd/`
Test with `pytest -n12 test/amd/`
`AMD_LLVM=1 pytest -n12 test/amd/`
* dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
* emu.py -- an emulator for RDNA that runs in tinygrad with `AMD=1 MOCKGPU=1 PYTHON_REMU=1`
@ -11,7 +11,7 @@ Test with `PYTHONPATH="." pytest -n12 extra/assembly/amd/`
The code should be as readable and deduplicated as possible. asm and emu shouldn't be required for dsl.
The autogen folder is autogenerated from the AMD PDFs with `python3 -m extra.assembly.amd.pdf --arch all`
The autogen folder (`tinygrad/runtime/autogen/amd/`) is autogenerated from the AMD ISA XML and PDFs with `python3 -m tinygrad.renderer.amd.generate`
test_emu.py has a good set of instruction tests for the emulation, with USE_HW=1 it will compare to real hardware.
Whenever an instruction is fixed, regression tests should be added here and confirmed with real hardware.
@ -20,20 +20,20 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same
tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.
`PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
The ops tests also pass, but they are very slow, so you should run them one at a time.
`SKIP_SLOW_TEST=1 PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_ops.py`
When something is caught by main tinygrad tests, a local regression test should be added to `extra/assembly/amd/test`.
When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware.
If a test is failing with `AMD=1 PYTHON_REMU=1 MOCKGPU=1` it's because an instruction is emulated incorrectly.
You can test without `MOCKGPU=1` to test on real hardware, if it works on real hardware there's a bug in the emulator.
IMPORTANT: if a test is failing in the emulator, it's an instruction bug. Use DEBUG=7, get the instructions, and debug.
Currently, only RDNA3 is well supported, but when finished, this will support RDNA3+RDNA4+CDNA in ~3000 lines.
Get line count with `cloc --by-file extra/assembly/amd/*.py`
Get line count with `cloc --by-file tinygrad/renderer/amd/*.py`

View file

@ -0,0 +1,72 @@
# Instruction format detection and decoding
from __future__ import annotations
from tinygrad.renderer.amd.dsl import Inst, FixedBitField, EnumBitField
# SDWA/DPP variant classes are told apart by the src0 field (bits 0-8 of the first dword):
# 0xf9 (249) selects SDWA and 0xfa (250) selects DPP16 on CDNA (GFX9)
_VARIANT_SRC0 = {"_SDWA_SDST": 0xf9, "_SDWA": 0xf9, "_DPP16": 0xfa}

def _matches(data: bytes, cls: type[Inst]) -> bool:
  """Return True if `data` is consistent with the encoding of instruction class `cls`.

  For every field in `cls._fields` the dword containing the field's low bit must be fully present in `data`.
  A FixedBitField must then hold its `default` value, and an EnumBitField with a non-None `allowed` set must
  decode to a member of that set. Classes whose name ends with a `_VARIANT_SRC0` suffix additionally need the
  low 9 bits of the first dword to equal the suffix's src0 value.
  """
  # NOTE(review): each field is read from the single dword that contains field.lo -- assumes no field straddles a dword
  for _, fld in cls._fields:
    byte_off = (fld.lo // 32) * 4
    if len(data) < byte_off + 4: return False  # not enough bytes to hold this field's dword
    dword, shift = int.from_bytes(data[byte_off:byte_off+4], 'little'), fld.lo % 32
    if isinstance(fld, FixedBitField) and ((dword >> shift) & fld.mask) != fld.default: return False
    if isinstance(fld, EnumBitField) and fld.allowed is not None:
      try: op = fld.decode((dword >> shift) & fld.mask)
      except ValueError: return False  # raw value is not a member of the enum
      if op not in fld.allowed: return False
  # variant classes are identified by class-name suffix; the first suffix that matches (in dict order) decides
  want_src0 = next((src0 for suffix, src0 in _VARIANT_SRC0.items() if cls.__name__.endswith(suffix)), None)
  return want_src0 is None or (int.from_bytes(data[:4], 'little') & 0x1ff) == want_src0
# The instruction format tables are built lazily to avoid circular imports (ins.py imports dsl.py, which is in this package)
_FORMATS: dict[str, list[type[Inst]]] | None = None

def _load_formats() -> dict[str, list[type[Inst]]]:
  """Return the per-architecture lists of instruction classes, building and caching them on first use.

  The result maps "rdna3", "rdna4" and "cdna" to a list of classes in match-priority order. The same dict
  object is returned on every later call.
  """
  global _FORMATS
  if _FORMATS is None:
    # imported here rather than at module top: see the circular-import note on _FORMATS
    from tinygrad.runtime.autogen.amd.rdna3 import ins as r3
    from tinygrad.runtime.autogen.amd.rdna4 import ins as r4
    from tinygrad.runtime.autogen.amd.cdna import ins as cdna
    # Order matters: more specific encodings first, catch-alls (SOP2, VOP2) last
    # Order: base before _LIT (base matches regular ops, _LIT catches lit-only ops excluded from base)
    _FORMATS = {
      "rdna3": [r3.VOPD, r3.VOP3P, r3.VINTERP, r3.VOP3SD, r3.VOP3_SDST, r3.VOP3, r3.DS, r3.GLOBAL, r3.SCRATCH, r3.FLAT, r3.SMEM,
                r3.SOP1, r3.SOP1_LIT, r3.SOP2, r3.SOP2_LIT, r3.SOPC, r3.SOPK, r3.SOPK_LIT, r3.SOPP, r3.VOPC,
                r3.VOP1_SDST, r3.VOP1, r3.VOP1_LIT, r3.VOP2, r3.VOP2_LIT],
      # rdna4 names its memory formats VGLOBAL / VSCRATCH / VFLAT
      "rdna4": [r4.VOPD, r4.VOP3P, r4.VINTERP, r4.VOP3SD, r4.VOP3_SDST, r4.VOP3, r4.DS, r4.VGLOBAL, r4.VSCRATCH, r4.VFLAT, r4.SMEM,
                r4.SOP1, r4.SOP1_LIT, r4.SOPC, r4.SOPC_LIT, r4.SOPP, r4.SOPK, r4.SOPK_LIT, r4.VOPC,
                r4.VOP1_SDST, r4.VOP1, r4.VOP1_LIT, r4.SOP2, r4.SOP2_LIT, r4.VOP2, r4.VOP2_LIT],
      "cdna": [cdna.VOP3PX2, cdna.VOP3P_MFMA, cdna.VOP3P, cdna.VOP3SD, cdna.VOP3_SDST, cdna.VOP3, cdna.DS, cdna.GLOBAL, cdna.SCRATCH,
               cdna.FLAT, cdna.MUBUF, cdna.SMEM, cdna.SOP1, cdna.SOPC, cdna.SOPP, cdna.SOPK, cdna.SOPK_LIT,
               cdna.VOPC_SDWA_SDST, cdna.VOPC, cdna.VOP1_DPP16, cdna.VOP1_SDWA, cdna.VOP1,
               cdna.VOP2_DPP16, cdna.VOP2_SDWA, cdna.SOP2, cdna.VOP2, cdna.VOP2_LIT],
    }
  return _FORMATS
def detect_format(data: bytes, arch: str = "rdna3") -> type[Inst]:
  """Return the first instruction class for `arch` whose encoding matches `data`.

  Candidates are tried in the order given by `_load_formats()[arch]`. Fails an assertion if `data` is shorter
  than 4 bytes and raises ValueError if no candidate matches.
  """
  assert len(data) >= 4, f"need at least 4 bytes, got {len(data)}"
  found = next((cls for cls in _load_formats()[arch] if _matches(data, cls)), None)
  if found is None: raise ValueError(f"unknown {arch} format word={int.from_bytes(data[:4], 'little'):#010x}")
  return found
def decode_inst(data: bytes, arch: str = "rdna3") -> Inst:
  """Decode machine code bytes into an instruction: detect the format for `arch`, then build it with `from_bytes`."""
  inst_cls = detect_format(data, arch)
  return inst_cls.from_bytes(data)

View file

@ -236,9 +236,9 @@ class VDSTYField(BitField):
# ══════════════════════════════════════════════════════════════
import functools
from extra.assembly.amd.autogen.rdna3.operands import OPERANDS as OPERANDS_RDNA3
from extra.assembly.amd.autogen.rdna4.operands import OPERANDS as OPERANDS_RDNA4
from extra.assembly.amd.autogen.cdna.operands import OPERANDS as OPERANDS_CDNA
from tinygrad.runtime.autogen.amd.rdna3.operands import OPERANDS as OPERANDS_RDNA3
from tinygrad.runtime.autogen.amd.rdna4.operands import OPERANDS as OPERANDS_RDNA4
from tinygrad.runtime.autogen.amd.cdna.operands import OPERANDS as OPERANDS_CDNA
OPERANDS = {**OPERANDS_CDNA, **OPERANDS_RDNA3, **OPERANDS_RDNA4}
# ══════════════════════════════════════════════════════════════

View file

@ -48,14 +48,14 @@ from tinygrad.runtime.autogen import hsa
from tinygrad.helpers import Context, DEBUG, colored
from tinygrad.engine.realize import get_runner
from extra.assembly.amd import decode_inst
from extra.assembly.amd.autogen.rdna3.str_pcode import PCODE as PCODE_RDNA3
from extra.assembly.amd.autogen.rdna4.str_pcode import PCODE as PCODE_RDNA4
from extra.assembly.amd.autogen.rdna3 import ins as ir3
from extra.assembly.amd.autogen.rdna4 import ins as ir4
from extra.assembly.amd.dsl import VCC_LO, EXEC_LO, SCC, ttmp
from extra.assembly.amd.autogen.common import Fmt, OpType
from extra.assembly.amd.pcode import parse_block, _FUNCS
from tinygrad.renderer.amd import decode_inst
from tinygrad.runtime.autogen.amd.rdna3.str_pcode import PCODE as PCODE_RDNA3
from tinygrad.runtime.autogen.amd.rdna4.str_pcode import PCODE as PCODE_RDNA4
from tinygrad.runtime.autogen.amd.rdna3 import ins as ir3
from tinygrad.runtime.autogen.amd.rdna4 import ins as ir4
from tinygrad.renderer.amd.dsl import VCC_LO, EXEC_LO, SCC, ttmp
from tinygrad.runtime.autogen.amd.common import Fmt, OpType
from tinygrad.renderer.amd.pcode import parse_block, _FUNCS
MASK32 = 0xFFFFFFFF

View file

@ -265,7 +265,7 @@ def write_common(all_fmts: dict[str, int], all_op_types: set[str], path: pathlib
def write_enum(enums, path):
lines: list[str] = ["# autogenerated from AMD ISA XML - do not edit",
"from extra.assembly.amd.autogen.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401", ""]
"from tinygrad.runtime.autogen.amd.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401", ""]
for name, ops in sorted(enums.items()):
if not ops: continue
suffix = "_E32" if name in ("VOP1", "VOP2", "VOPC") else "_E64" if name == "VOP3" else ""
@ -344,8 +344,8 @@ def write_ins(encodings, enums, suffix_only_ops, types, arch, path):
"SCC", "VCCZ", "EXECZ", "ttmp", "INV_2PI", "SDWA", "DPP", "DPP16", "LIT", "SRC_LDS_DIRECT"]
dsl_reexport = sorted(set(dsl_names + _DSL_REGS))
lines: list[str] = ["# autogenerated from AMD ISA XML - do not edit", "# ruff: noqa: E501,F401",
f"from extra.assembly.amd.dsl import {', '.join(dsl_reexport)}",
f"from extra.assembly.amd.autogen.{arch}.enum import {', '.join(enum_names)}", "import functools", ""]
f"from tinygrad.renderer.amd.dsl import {', '.join(dsl_reexport)}",
f"from tinygrad.runtime.autogen.amd.{arch}.enum import {', '.join(enum_names)}", "import functools", ""]
def fmt_allowed(op_enum: str, ops: set[int]) -> str:
"""Format allowed ops as {EnumName.MEMBER, ...}."""
@ -456,8 +456,8 @@ def write_operands(types: dict, enums: dict, arch: str, path: pathlib.Path) -> N
used_bases = {eb for (nm, eb) in types if (nm, eb) in valid}
enum_names = sorted(f"{k}Op" for k in used_bases)
lines: list[str] = ["# autogenerated from AMD ISA XML - do not edit",
"from extra.assembly.amd.autogen.common import Fmt, OpType",
f"from extra.assembly.amd.autogen.{arch}.enum import {', '.join(enum_names)}", ""]
"from tinygrad.runtime.autogen.amd.common import Fmt, OpType",
f"from tinygrad.runtime.autogen.amd.{arch}.enum import {', '.join(enum_names)}", ""]
lines.append("# instruction operand info: {Op: {field: (Fmt, size_bits, OpType)}}")
lines.append("OPERANDS = {")
def fmt_val(v):
@ -481,7 +481,7 @@ def write_pcode(pcode: dict[tuple[str, int], str], enums: dict[str, dict[int, st
entries.append((f"{fmt_name}Op", f"{name}{msuf}", opcode, pcode[(name, opcode)]))
enum_names = sorted(set(e[0] for e in entries))
lines = ["# autogenerated from AMD ISA PDF - do not edit", "# ruff: noqa: E501",
f"from extra.assembly.amd.autogen.{arch}.enum import {', '.join(enum_names)}", "", "PCODE = {"]
f"from tinygrad.runtime.autogen.amd.{arch}.enum import {', '.join(enum_names)}", "", "PCODE = {"]
for enum_name, name, opcode, code in sorted(entries, key=lambda x: (x[0], x[2])):
lines.append(f" {enum_name}.{name}: {code!r},")
lines.append("}")
@ -508,12 +508,13 @@ if __name__ == "__main__":
all_fmts[fmt] = bits
all_op_types.update(op_types_set)
# Write common.py
common_path = pathlib.Path(__file__).parent / "autogen" / "common.py"
autogen_base = pathlib.Path(__file__).parents[2] / "runtime" / "autogen" / "amd"
common_path = autogen_base / "common.py"
write_common(all_fmts, all_op_types, common_path)
print(f"Wrote common.py: {len(all_fmts)} formats, {len(all_op_types)} op types")
# Write per-arch files from XML
for arch, data in arch_data.items():
base = pathlib.Path(__file__).parent / "autogen" / arch
base = autogen_base / arch
write_enum(data["enums"], base / "enum.py")
write_ins(data["encodings"], data["enums"], data["suffix_only_ops"], data["types"], arch, base / "ins.py")
write_operands(data["types"], data["enums"], arch, base / "operands.py")
@ -524,6 +525,6 @@ if __name__ == "__main__":
pages = extract_pdf_text(cfg["pdf"])
name_to_op = {name: op for ops in arch_data[arch]["enums"].values() for op, name in ops.items()}
pcode = extract_pcode(pages, name_to_op)
base = pathlib.Path(__file__).parent / "autogen" / arch
base = autogen_base / arch
write_pcode(pcode, arch_data[arch]["enums"], arch, base / "str_pcode.py")
print(f" {arch}: {len(pcode)} pcode entries")

View file

@ -8,9 +8,9 @@ from __future__ import annotations
from dataclasses import dataclass
from typing import Iterator
from enum import Enum
from extra.assembly.amd.dsl import BitField, FixedBitField, Inst, bits
from extra.assembly.amd.autogen.rdna3.ins import SOPP, s_endpgm
from extra.assembly.amd.autogen.rdna3.enum import SOPPOp
from tinygrad.renderer.amd.dsl import BitField, FixedBitField, Inst, bits
from tinygrad.runtime.autogen.amd.rdna3.ins import SOPP, s_endpgm
from tinygrad.runtime.autogen.amd.rdna3.enum import SOPPOp
# ═══════════════════════════════════════════════════════════════════════════════
# FIELD ENUMS

View file

@ -1,5 +1,5 @@
# autogenerated from AMD ISA XML - do not edit
from extra.assembly.amd.autogen.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
from tinygrad.runtime.autogen.amd.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
class DSOp(ReprEnum):
DS_ADD_U32 = 0

View file

@ -1,7 +1,7 @@
# autogenerated from AMD ISA XML - do not edit
# ruff: noqa: E501,F401
from extra.assembly.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SRsrcField, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VGPRField, s, src, ttmp, v
from extra.assembly.amd.autogen.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3PX2Op, VOP3SDOp, VOPCOp, HWREG
from tinygrad.renderer.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SRsrcField, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VGPRField, s, src, ttmp, v
from tinygrad.runtime.autogen.amd.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3PX2Op, VOP3SDOp, VOPCOp, HWREG
import functools
class DS(Inst):

View file

@ -1,6 +1,6 @@
# autogenerated from AMD ISA XML - do not edit
from extra.assembly.amd.autogen.common import Fmt, OpType
from extra.assembly.amd.autogen.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3PX2Op, VOP3SDOp, VOPCOp
from tinygrad.runtime.autogen.amd.common import Fmt, OpType
from tinygrad.runtime.autogen.amd.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3PX2Op, VOP3SDOp, VOPCOp
# instruction operand info: {Op: {field: (Fmt, size_bits, OpType)}}
OPERANDS = {

View file

@ -1,6 +1,6 @@
# autogenerated from AMD ISA PDF - do not edit
# ruff: noqa: E501
from extra.assembly.amd.autogen.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp
from tinygrad.runtime.autogen.amd.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp
PCODE = {
DSOp.DS_ADD_U32: 'addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32);\ntmp = MEM[addr].u32;\nMEM[addr].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',

View file

@ -1,5 +1,5 @@
# autogenerated from AMD ISA XML - do not edit
from extra.assembly.amd.autogen.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
from tinygrad.runtime.autogen.amd.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
class DSOp(ReprEnum):
DS_ADD_U32 = 0

View file

@ -1,7 +1,7 @@
# autogenerated from AMD ISA XML - do not edit
# ruff: noqa: E501,F401
from extra.assembly.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SRsrcField, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VDSTYField, VGPRField, s, src, ttmp, v
from extra.assembly.amd.autogen.rdna3.enum import DSOp, EXPOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, HWREG, MSG
from tinygrad.renderer.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SRsrcField, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VDSTYField, VGPRField, s, src, ttmp, v
from tinygrad.runtime.autogen.amd.rdna3.enum import DSOp, EXPOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, HWREG, MSG
import functools
class DS(Inst):

View file

@ -1,6 +1,6 @@
# autogenerated from AMD ISA XML - do not edit
from extra.assembly.amd.autogen.common import Fmt, OpType
from extra.assembly.amd.autogen.rdna3.enum import DSOp, EXPOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp
from tinygrad.runtime.autogen.amd.common import Fmt, OpType
from tinygrad.runtime.autogen.amd.rdna3.enum import DSOp, EXPOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp
# instruction operand info: {Op: {field: (Fmt, size_bits, OpType)}}
OPERANDS = {

View file

@ -1,6 +1,6 @@
# autogenerated from AMD ISA PDF - do not edit
# ruff: noqa: E501
from extra.assembly.amd.autogen.rdna3.enum import DSOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp
from tinygrad.runtime.autogen.amd.rdna3.enum import DSOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp
PCODE = {
DSOp.DS_ADD_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',

View file

@ -1,5 +1,5 @@
# autogenerated from AMD ISA XML - do not edit
from extra.assembly.amd.autogen.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
from tinygrad.runtime.autogen.amd.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
class DSOp(ReprEnum):
DS_ADD_U32 = 0

View file

@ -1,7 +1,7 @@
# autogenerated from AMD ISA XML - do not edit
# ruff: noqa: E501,F401
from extra.assembly.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VDSTYField, VGPRField, s, src, ttmp, v
from extra.assembly.amd.autogen.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VDSDIROp, VEXPORTOp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp, HWREG, MSG
from tinygrad.renderer.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VDSTYField, VGPRField, s, src, ttmp, v
from tinygrad.runtime.autogen.amd.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VDSDIROp, VEXPORTOp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp, HWREG, MSG
import functools
class DS(Inst):

View file

@ -1,6 +1,6 @@
# autogenerated from AMD ISA XML - do not edit
from extra.assembly.amd.autogen.common import Fmt, OpType
from extra.assembly.amd.autogen.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VDSDIROp, VEXPORTOp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp
from tinygrad.runtime.autogen.amd.common import Fmt, OpType
from tinygrad.runtime.autogen.amd.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VDSDIROp, VEXPORTOp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp
# instruction operand info: {Op: {field: (Fmt, size_bits, OpType)}}
OPERANDS = {

View file

@ -1,6 +1,6 @@
# autogenerated from AMD ISA PDF - do not edit
# ruff: noqa: E501
from extra.assembly.amd.autogen.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp
from tinygrad.runtime.autogen.amd.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp
PCODE = {
DSOp.DS_ADD_U32: 'addr = CalcDsAddr(vgpr_a.b32, offset.b32);\ntmp = MEM[addr].u32;\nMEM[addr].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',

View file

@ -309,7 +309,7 @@ def load_counters(profile:list[ProfileEvent]) -> None:
ctxs.append({"name":f"Exec {name}"+(f" n{run_number[k]}" if run_number[k] > 1 else ""), "steps":steps})
def sqtt_timeline(data:bytes, lib:bytes, target:int) -> list[ProfileEvent]:
from extra.assembly.amd.sqtt import map_insts, InstructionInfo, PacketType, INST, InstOp, VALUINST, IMMEDIATE, IMMEDIATE_MASK, VMEMEXEC, ALUEXEC
from tinygrad.renderer.amd.sqtt import map_insts, InstructionInfo, PacketType, INST, InstOp, VALUINST, IMMEDIATE, IMMEDIATE_MASK, VMEMEXEC, ALUEXEC
ret:list[ProfileEvent] = []
rows:dict[str, None] = {}
trace:dict[str, set[int]] = {}
@ -430,10 +430,10 @@ def amd_readelf(lib:bytes) -> list[dict]:
return [{"label":f"{resource} Alloc", "value":val} for resource,val in [("VGPR", (vgpr_gran+1)*8-7), ("LDS",kd.group_segment_fixed_size),
("Scratch", kd.private_segment_fixed_size)] if val > 0]
def amd_decode(lib:bytes, target:int) -> dict[int, Any]: # Any is the Inst class from extra.assembly.amd.dsl
def amd_decode(lib:bytes, target:int) -> dict[int, Any]: # Any is the Inst class from tinygrad.renderer.amd.dsl
from tinygrad.runtime.support.elf import elf_loader
from extra.assembly.amd import detect_format
from extra.assembly.amd.dsl import Inst
from tinygrad.renderer.amd import detect_format
from tinygrad.renderer.amd.dsl import Inst
image, sections, _ = elf_loader(lib)
text = next((sh for sh in sections if sh.name == ".text"), None)
assert text is not None, "no .text section found in ELF"
@ -486,7 +486,7 @@ def amdgpu_cfg(lib:bytes, target:int) -> dict:
else: paths[curr].update([(nx+offset, COND_TAKEN), (nx, COND_NOT_TAKEN)])
elif nx in leaders: paths[curr][nx] = UNCOND
pc_tokens:dict[int, list[dict]] = {}
from extra.assembly.amd.dsl import Reg
from tinygrad.renderer.amd.dsl import Reg
for pc, inst in pc_table.items():
pc_tokens[pc] = tokens = []
for name, field in inst._fields: