mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
renderer/amd: move in tree (#14702)
* renderer/amd: move in tree * fix paths in tests * 24000 lines * no delete for amd files
This commit is contained in:
parent
d5fc3ea1ba
commit
4680247e35
71 changed files with 266 additions and 253 deletions
2
.github/workflows/autogen.yml
vendored
2
.github/workflows/autogen.yml
vendored
|
|
@ -43,7 +43,7 @@ jobs:
|
|||
run: sudo apt-get install -y --no-install-recommends libclang-20-dev llvm-20-dev hip-dev libusb-1.0-0-dev libdrm-dev
|
||||
- name: Regenerate autogen files
|
||||
run: |
|
||||
find tinygrad/runtime/autogen -type f -name "*.py" -not -name "__init__.py" -not -name "comgr_3.py" -not -name "metal.py" -not -name "iokit.py" -not -name "corefoundation.py" -not -name "libclang.py" -delete
|
||||
find tinygrad/runtime/autogen -type f -name "*.py" -not -path "*/amd/*" -not -name "__init__.py" -not -name "comgr_3.py" -not -name "metal.py" -not -name "iokit.py" -not -name "corefoundation.py" -not -name "libclang.py" -delete
|
||||
python3 -c "from tinygrad.runtime.autogen import opencl"
|
||||
python3 -c "from tinygrad.runtime.autogen import cuda, nvrtc, nvjitlink, nv_570, nv_580, nv"
|
||||
python3 -c "from tinygrad.runtime.autogen import comgr, hsa, hip, amd_gpu, sqtt, rocprof, amdgpu_kd, amdgpu_drm"
|
||||
|
|
|
|||
14
.github/workflows/test.yml
vendored
14
.github/workflows/test.yml
vendored
|
|
@ -295,8 +295,8 @@ jobs:
|
|||
python extra/optimization/extract_dataset.py
|
||||
gzip -c /tmp/sops > extra/datasets/sops.gz
|
||||
#DEBUG=1 MIN_ASTS=1 python extra/optimization/get_action_space.py
|
||||
- name: Repo line count < 20000 lines
|
||||
run: MAX_LINE_COUNT=20000 python sz.py
|
||||
- name: Repo line count < 24000 lines
|
||||
run: MAX_LINE_COUNT=24000 python sz.py
|
||||
|
||||
spec:
|
||||
strategy:
|
||||
|
|
@ -683,8 +683,8 @@ jobs:
|
|||
python-version: '3.14'
|
||||
- name: Verify AMD autogen is up to date
|
||||
run: |
|
||||
python -m extra.assembly.amd.generate
|
||||
git diff --exit-code extra/assembly/amd/autogen/
|
||||
python -m tinygrad.renderer.amd.generate
|
||||
git diff --exit-code tinygrad/runtime/autogen/amd/
|
||||
- name: Install LLVM 21
|
||||
run: |
|
||||
wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
|
||||
|
|
@ -692,13 +692,13 @@ jobs:
|
|||
sudo apt-get update
|
||||
sudo apt-get install llvm-21 llvm-21-tools cloc
|
||||
- name: RDNA3 Line Count
|
||||
run: cloc --by-file extra/assembly/amd/*.py
|
||||
run: cloc --by-file tinygrad/renderer/amd/*.py
|
||||
- name: Install rocprof-trace-decoder
|
||||
run: sudo PYTHONPATH="." ./extra/sqtt/install_sqtt_decoder.py
|
||||
- name: Run RDNA3 emulator tests
|
||||
run: AMD_LLVM=0 python -m pytest -n=auto extra/assembly/amd/ --durations 20
|
||||
run: AMD_LLVM=0 python -m pytest -n=auto test/amd/ --durations 20
|
||||
- name: Run RDNA3 emulator tests (AMD_LLVM=1)
|
||||
run: AMD_LLVM=1 python -m pytest -n=auto extra/assembly/amd/ --durations 20
|
||||
run: AMD_LLVM=1 python -m pytest -n=auto test/amd/ --durations 20
|
||||
- name: Run RDNA3 dtype tests
|
||||
run: AMD_LLVM=0 pytest -n=auto test/backend/test_dtype_alu.py test/backend/test_dtype.py --durations 20
|
||||
- name: Run RDNA3 dtype tests (AMD_LLVM=1)
|
||||
|
|
|
|||
|
|
@ -76,12 +76,12 @@ VIZ=1 python -c "from tinygrad import Tensor; Tensor.ones(10).sum().realize()"
|
|||
## Auto-generated Files (DO NOT EDIT)
|
||||
|
||||
The following files are auto-generated and should never be edited manually:
|
||||
- `extra/assembly/amd/autogen/{arch}/__init__.py` - Generated by `python -m extra.assembly.amd.dsl --arch {arch}`
|
||||
- `extra/assembly/amd/autogen/{arch}/gen_pcode.py` - Generated by `python -m extra.assembly.amd.pcode --arch {arch}`
|
||||
- `tinygrad/runtime/autogen/amd/{arch}/__init__.py` - Generated by `python -m tinygrad.renderer.amd.dsl --arch {arch}`
|
||||
- `tinygrad/runtime/autogen/amd/{arch}/gen_pcode.py` - Generated by `python -m tinygrad.renderer.amd.pcode --arch {arch}`
|
||||
|
||||
Where `{arch}` is one of: `rdna3`, `rdna4`, `cdna`
|
||||
|
||||
To add missing instruction implementations, add them to `extra/assembly/amd/emu.py` instead.
|
||||
To add missing instruction implementations, add them to `tinygrad/renderer/amd/emu.py` instead.
|
||||
|
||||
## Style Notes
|
||||
|
||||
|
|
|
|||
|
|
@ -1,67 +0,0 @@
|
|||
# Instruction format detection and decoding
|
||||
from __future__ import annotations
|
||||
from extra.assembly.amd.dsl import Inst, FixedBitField, EnumBitField
|
||||
|
||||
# SDWA/DPP variant detection: src0 field (bits 0-8) encodes the variant
|
||||
# 0xf9 (249) = SDWA, 0xfa (250) = DPP16 for CDNA (GFX9)
|
||||
_VARIANT_SRC0 = {"_SDWA_SDST": 0xf9, "_SDWA": 0xf9, "_DPP16": 0xfa}
|
||||
|
||||
def _matches(data: bytes, cls: type[Inst]) -> bool:
|
||||
"""Check if data matches all FixedBitFields and op is in allowed."""
|
||||
for _, field in cls._fields:
|
||||
dword_idx = field.lo // 32
|
||||
if len(data) < (dword_idx + 1) * 4: return False
|
||||
word = int.from_bytes(data[dword_idx*4:(dword_idx+1)*4], 'little')
|
||||
field_lo = field.lo % 32
|
||||
if isinstance(field, FixedBitField):
|
||||
if ((word >> field_lo) & field.mask) != field.default: return False
|
||||
if isinstance(field, EnumBitField) and field.allowed is not None:
|
||||
try: opcode = field.decode((word >> field_lo) & field.mask)
|
||||
except ValueError: return False # opcode not in enum
|
||||
if opcode not in field.allowed: return False
|
||||
# Check SDWA/DPP variant based on src0 field (bits 0-8) - only for variant classes
|
||||
name = cls.__name__
|
||||
word = int.from_bytes(data[:4], 'little')
|
||||
for suffix, expected_src0 in _VARIANT_SRC0.items():
|
||||
if name.endswith(suffix): return (word & 0x1ff) == expected_src0
|
||||
return True
|
||||
|
||||
# Import instruction classes for each architecture
|
||||
from extra.assembly.amd.autogen.rdna3.ins import (VOP1, VOP1_SDST, VOP1_LIT, VOP2, VOP2_LIT, VOP3, VOP3_SDST, VOP3SD, VOP3P, VOPC, VOPD, VINTERP,
|
||||
SOP1, SOP1_LIT, SOP2, SOP2_LIT, SOPC, SOPK, SOPK_LIT, SOPP, SMEM, DS, FLAT, GLOBAL, SCRATCH)
|
||||
from extra.assembly.amd.autogen.rdna4.ins import (VOP1 as R4_VOP1, VOP1_SDST as R4_VOP1_SDST, VOP1_LIT as R4_VOP1_LIT,
|
||||
VOP2 as R4_VOP2, VOP2_LIT as R4_VOP2_LIT, VOP3 as R4_VOP3, VOP3_SDST as R4_VOP3_SDST, VOP3SD as R4_VOP3SD, VOP3P as R4_VOP3P,
|
||||
VOPC as R4_VOPC, VOPD as R4_VOPD, VINTERP as R4_VINTERP, SOP1 as R4_SOP1, SOP1_LIT as R4_SOP1_LIT,
|
||||
SOP2 as R4_SOP2, SOP2_LIT as R4_SOP2_LIT, SOPC as R4_SOPC, SOPC_LIT as R4_SOPC_LIT,
|
||||
SOPK as R4_SOPK, SOPK_LIT as R4_SOPK_LIT, SOPP as R4_SOPP,
|
||||
SMEM as R4_SMEM, DS as R4_DS, VFLAT as R4_FLAT, VGLOBAL as R4_GLOBAL, VSCRATCH as R4_SCRATCH)
|
||||
from extra.assembly.amd.autogen.cdna.ins import (VOP1 as C_VOP1, VOP1_SDWA as C_VOP1_SDWA, VOP1_DPP16 as C_VOP1_DPP16,
|
||||
VOP2 as C_VOP2, VOP2_LIT as C_VOP2_LIT, VOP2_SDWA as C_VOP2_SDWA, VOP2_DPP16 as C_VOP2_DPP16,
|
||||
VOPC as C_VOPC, VOPC_SDWA_SDST as C_VOPC_SDWA_SDST,
|
||||
VOP3 as C_VOP3, VOP3_SDST as C_VOP3_SDST, VOP3SD as C_VOP3SD, VOP3P as C_VOP3P, VOP3P_MFMA as C_VOP3P_MFMA, VOP3PX2 as C_VOP3PX2,
|
||||
SOP1 as C_SOP1, SOP2 as C_SOP2, SOPC as C_SOPC, SOPK as C_SOPK, SOPK_LIT as C_SOPK_LIT, SOPP as C_SOPP, SMEM as C_SMEM, DS as C_DS,
|
||||
FLAT as C_FLAT, GLOBAL as C_GLOBAL, SCRATCH as C_SCRATCH, MUBUF as C_MUBUF)
|
||||
|
||||
# Order matters: more specific encodings first, catch-alls (SOP2, VOP2) last
|
||||
# Order: base before _LIT (base matches regular ops, _LIT catches lit-only ops excluded from base)
|
||||
_FORMATS = {
|
||||
"rdna3": [VOPD, VOP3P, VINTERP, VOP3SD, VOP3_SDST, VOP3, DS, GLOBAL, SCRATCH, FLAT, SMEM,
|
||||
SOP1, SOP1_LIT, SOP2, SOP2_LIT, SOPC, SOPK, SOPK_LIT, SOPP, VOPC, VOP1_SDST, VOP1, VOP1_LIT, VOP2, VOP2_LIT],
|
||||
"rdna4": [R4_VOPD, R4_VOP3P, R4_VINTERP, R4_VOP3SD, R4_VOP3_SDST, R4_VOP3, R4_DS, R4_GLOBAL, R4_SCRATCH, R4_FLAT, R4_SMEM,
|
||||
R4_SOP1, R4_SOP1_LIT, R4_SOPC, R4_SOPC_LIT, R4_SOPP, R4_SOPK, R4_SOPK_LIT, R4_VOPC, R4_VOP1_SDST, R4_VOP1, R4_VOP1_LIT,
|
||||
R4_SOP2, R4_SOP2_LIT, R4_VOP2, R4_VOP2_LIT],
|
||||
"cdna": [C_VOP3PX2, C_VOP3P_MFMA, C_VOP3P, C_VOP3SD, C_VOP3_SDST, C_VOP3, C_DS, C_GLOBAL, C_SCRATCH, C_FLAT, C_MUBUF, C_SMEM,
|
||||
C_SOP1, C_SOPC, C_SOPP, C_SOPK, C_SOPK_LIT, C_VOPC_SDWA_SDST, C_VOPC,
|
||||
C_VOP1_DPP16, C_VOP1_SDWA, C_VOP1, C_VOP2_DPP16, C_VOP2_SDWA, C_SOP2, C_VOP2, C_VOP2_LIT],
|
||||
}
|
||||
|
||||
def detect_format(data: bytes, arch: str = "rdna3") -> type[Inst]:
|
||||
"""Detect instruction format from machine code bytes."""
|
||||
assert len(data) >= 4, f"need at least 4 bytes, got {len(data)}"
|
||||
for cls in _FORMATS[arch]:
|
||||
if _matches(data, cls): return cls
|
||||
raise ValueError(f"unknown {arch} format word={int.from_bytes(data[:4], 'little'):#010x}")
|
||||
|
||||
def decode_inst(data: bytes, arch: str = "rdna3") -> Inst:
|
||||
"""Decode machine code bytes into an instruction."""
|
||||
return detect_format(data, arch).from_bytes(data)
|
||||
|
|
@ -14,8 +14,8 @@ from tinygrad import Tensor, Device, Context, GlobalCounters
|
|||
from tinygrad.uop.ops import UOp, Ops, KernelInfo
|
||||
from tinygrad.helpers import getenv, colored
|
||||
from tinygrad.engine.realize import Estimates
|
||||
from extra.assembly.amd.dsl import s, v, VCC_LO, NULL
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from tinygrad.renderer.amd.dsl import s, v, VCC_LO, NULL
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
|
||||
# =============================================================================
|
||||
# Kernel constants
|
||||
|
|
@ -192,7 +192,7 @@ class Kernel:
|
|||
inst.simm16 = offset_dwords
|
||||
|
||||
# TODO: replace this with direct ELF
|
||||
from extra.assembly.amd.test.disasm import disasm
|
||||
from test.amd.disasm import disasm
|
||||
body = ['\t' + disasm(inst) for inst in self.instructions]
|
||||
|
||||
# limit wave occupancy by using more LDS
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
from extra.assembly.amd.autogen.cdna.ins import *
|
||||
from tinygrad.runtime.autogen.amd.cdna.ins import *
|
||||
from tinygrad.dtype import dtypes
|
||||
|
||||
# M0 is encoded with 124 (NULL in RDNA) in CDNA
|
||||
|
|
@ -73,7 +73,7 @@ class Kernel:
|
|||
lines, pos = [], 0
|
||||
for inst in self.instructions:
|
||||
if (label := self.label_at_pos.get(pos)) is not None: lines.append(f"{label}:")
|
||||
from extra.assembly.amd.test.disasm import disasm
|
||||
from test.amd.disasm import disasm
|
||||
lines.append(f" {disasm(inst)}" if inst._target is None else f" {inst.op_name.lower()} {inst._target}")
|
||||
pos += inst.size()
|
||||
return "\n".join(lines)
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ os.environ["AMD_AQL"] = "1"
|
|||
|
||||
from tinygrad.device import Device
|
||||
from tinygrad.runtime.support.compiler_amd import HIPCompiler
|
||||
from extra.assembly.amd.dsl import Reg, Inst, s, v
|
||||
from tinygrad.renderer.amd.dsl import Reg, Inst, s, v
|
||||
|
||||
NUM_WORKGROUPS = 96
|
||||
WAVE_SIZE = 32
|
||||
|
|
@ -51,7 +51,7 @@ if __name__=="__main__":
|
|||
|
||||
COMPILER = HIPCompiler(arch)
|
||||
if arch in {'gfx1100', 'gfx1103', 'gfx1151'}:
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
if arch == 'gfx1103': NUM_WORKGROUPS = 8
|
||||
if arch == 'gfx1151': NUM_WORKGROUPS = 32
|
||||
launchBenchmark(v_wmma_bf16_16x16x16_bf16, (7,8,15))
|
||||
|
|
@ -61,7 +61,7 @@ if __name__=="__main__":
|
|||
launchBenchmark(v_wmma_i32_16x16x16_iu4, (7,8,9))
|
||||
launchBenchmark(v_wmma_i32_16x16x16_iu8, (7,8,11))
|
||||
elif arch in {'gfx1200', 'gfx1201'}:
|
||||
from extra.assembly.amd.autogen.rdna4.ins import *
|
||||
from tinygrad.runtime.autogen.amd.rdna4.ins import *
|
||||
# this instruction does not exist in the rdna4 isa, use the co version
|
||||
s_sub_u32 = s_sub_co_u32
|
||||
NUM_WORKGROUPS = 64
|
||||
|
|
@ -90,7 +90,7 @@ if __name__=="__main__":
|
|||
FLOPS_PER_MATMUL = 16*16*64*2
|
||||
launchBenchmark(v_swmmac_i32_16x16x64_iu4, (7,8,9,10,13,14), False)
|
||||
elif arch == 'gfx950':
|
||||
from extra.assembly.amd.autogen.cdna.ins import *
|
||||
from tinygrad.runtime.autogen.amd.cdna.ins import *
|
||||
DIRECTIVE = ".amdhsa_accum_offset 4"
|
||||
NUM_WORKGROUPS = 256
|
||||
WAVE_SIZE = 64
|
||||
|
|
|
|||
|
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
|||
import enum, collections
|
||||
from typing import Iterator
|
||||
from tinygrad.helpers import colored
|
||||
from extra.assembly.amd.sqtt import PacketType, bits
|
||||
from tinygrad.renderer.amd.sqtt import PacketType, bits
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# STALL REASONS
|
||||
|
|
|
|||
|
|
@ -7,8 +7,8 @@ import subprocess, struct, math, functools
|
|||
from tinygrad import Tensor, dtypes, Device
|
||||
from tinygrad.helpers import getenv
|
||||
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from extra.assembly.amd.asm import waitcnt
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
from tinygrad.renderer.amd.asm import waitcnt
|
||||
|
||||
from test.testextra.test_cfg_viz import asm_kernel
|
||||
|
||||
|
|
|
|||
|
|
@ -27,9 +27,14 @@ packages = [
|
|||
'tinygrad.mixin',
|
||||
'tinygrad.nn',
|
||||
'tinygrad.renderer',
|
||||
'tinygrad.renderer.amd',
|
||||
'tinygrad.runtime',
|
||||
'tinygrad.runtime.autogen',
|
||||
'tinygrad.runtime.autogen.am',
|
||||
'tinygrad.runtime.autogen.amd',
|
||||
'tinygrad.runtime.autogen.amd.rdna3',
|
||||
'tinygrad.runtime.autogen.amd.rdna4',
|
||||
'tinygrad.runtime.autogen.amd.cdna',
|
||||
'tinygrad.runtime.graph',
|
||||
'tinygrad.runtime.support',
|
||||
'tinygrad.runtime.support.am',
|
||||
|
|
@ -128,7 +133,7 @@ debug = true
|
|||
|
||||
[tool.mypy]
|
||||
warn_unused_configs = true
|
||||
files = ["tinygrad", "extra/assembly/amd"]
|
||||
files = ["tinygrad"]
|
||||
ignore_missing_imports = true
|
||||
check_untyped_defs = true
|
||||
explicit_package_bases = true
|
||||
|
|
@ -142,9 +147,7 @@ strict_equality = true
|
|||
module = "extra.*"
|
||||
follow_imports = "skip"
|
||||
|
||||
[[tool.mypy.overrides]]
|
||||
module = "extra.assembly.amd.*"
|
||||
follow_imports = "normal"
|
||||
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
norecursedirs = [
|
||||
|
|
@ -184,7 +187,6 @@ exclude = [
|
|||
".git/",
|
||||
"docs/",
|
||||
"extra/",
|
||||
"!extra/assembly/amd/",
|
||||
"test/external/mlperf_resnet",
|
||||
"test/external/mlperf_unet3d",
|
||||
]
|
||||
|
|
@ -250,8 +252,8 @@ select = [
|
|||
"F841",
|
||||
]
|
||||
"tinygrad/runtime/autogen/**/*.py" = ["E501", "F401", "E722", "E731", "F821", "A006", "A002", "F811"]
|
||||
"extra/assembly/amd/autogen/**/*.py" = ["E501"]
|
||||
"extra/assembly/amd/test/**/*.py" = ["F403", "F405"]
|
||||
"tinygrad/runtime/autogen/amd/**/*.py" = ["E501"]
|
||||
"test/amd/**/*.py" = ["F403", "F405"]
|
||||
|
||||
[tool.ruff.format]
|
||||
exclude = ["*"]
|
||||
|
|
|
|||
|
|
@ -6,13 +6,15 @@ from pathlib import Path
|
|||
# Set AMD=1 before importing tinygrad
|
||||
os.environ["AMD"] = "1"
|
||||
|
||||
from extra.assembly.amd.emu import run_asm as python_run_asm, decode_program
|
||||
from extra.assembly.amd import decode_inst
|
||||
from extra.assembly.amd.autogen.rdna3.ins import SOPP, SOPPOp
|
||||
from tinygrad.renderer.amd.emu import run_asm as python_run_asm, decode_program
|
||||
from tinygrad.renderer.amd import decode_inst
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import SOPP, SOPPOp
|
||||
|
||||
REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.so"
|
||||
import tinygrad
|
||||
EXTRA_DIR = Path(tinygrad.__file__).parent.parent / "extra"
|
||||
REMU_PATH = EXTRA_DIR / "remu/target/release/libremu.so"
|
||||
if not REMU_PATH.exists():
|
||||
REMU_PATH = Path(__file__).parents[3] / "remu/target/release/libremu.dylib"
|
||||
REMU_PATH = EXTRA_DIR / "remu/target/release/libremu.dylib"
|
||||
|
||||
def get_rust_remu():
|
||||
"""Load the Rust libremu shared library."""
|
||||
|
|
@ -68,7 +70,7 @@ def benchmark_emulator(name: str, run_fn, kernel: bytes, global_size, local_size
|
|||
|
||||
def profile_instructions(kernel: bytes):
|
||||
"""Profile individual instruction compile times."""
|
||||
from extra.assembly.amd.emu import _get_runner, _canonical_runner_cache
|
||||
from tinygrad.renderer.amd.emu import _get_runner, _canonical_runner_cache
|
||||
from tinygrad.helpers import Context
|
||||
_get_runner.cache_clear()
|
||||
_canonical_runner_cache.clear()
|
||||
|
|
@ -98,7 +100,7 @@ def profile_instructions(kernel: bytes):
|
|||
|
||||
def benchmark_python_split(kernel: bytes, global_size, local_size, args_ptr, rsrc2: int, iterations: int = 5):
|
||||
"""Benchmark Python emulator with compile and execution times."""
|
||||
from extra.assembly.amd.emu import _get_runner, _canonical_runner_cache
|
||||
from tinygrad.renderer.amd.emu import _get_runner, _canonical_runner_cache
|
||||
from tinygrad.helpers import Context
|
||||
_get_runner.cache_clear()
|
||||
_canonical_runner_cache.clear()
|
||||
|
|
@ -2,7 +2,7 @@
|
|||
from __future__ import annotations
|
||||
import re
|
||||
from typing import Callable
|
||||
from extra.assembly.amd.dsl import Inst, Reg
|
||||
from tinygrad.renderer.amd.dsl import Inst, Reg
|
||||
|
||||
# Special register mappings for disassembly
|
||||
SPECIAL_GPRS = {106: 'vcc_lo', 107: 'vcc_hi', 124: 'null', 125: 'm0', 126: 'exec_lo', 127: 'exec_hi',
|
||||
|
|
@ -81,17 +81,17 @@ def _num_srcs(inst) -> int:
|
|||
# IMPORTS
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
from extra.assembly.amd.autogen.rdna3.ins import (VOP1, VOP1_SDST, VOP1_SDST_LIT, VOP1_LIT, VOP2, VOP2_LIT, VOP3, VOP3_SDST, VOP3_SDST_LIT,
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import (VOP1, VOP1_SDST, VOP1_SDST_LIT, VOP1_LIT, VOP2, VOP2_LIT, VOP3, VOP3_SDST, VOP3_SDST_LIT,
|
||||
VOP3_LIT, VOP3SD, VOP3SD_LIT, VOP3P, VOP3P_LIT, VOPC, VOPC_LIT, VOPD, VOPD_LIT, VINTERP, SOP1, SOP1_LIT, SOP2, SOP2_LIT, SOPC, SOPC_LIT,
|
||||
SOPK, SOPK_LIT, SOPP, SMEM, DS, FLAT, GLOBAL, SCRATCH, VOP2Op, VOPDOp, SOPPOp, HWREG, MSG)
|
||||
from extra.assembly.amd.autogen.rdna4.ins import (VOP1 as R4_VOP1, VOP1_SDST as R4_VOP1_SDST,
|
||||
from tinygrad.runtime.autogen.amd.rdna4.ins import (VOP1 as R4_VOP1, VOP1_SDST as R4_VOP1_SDST,
|
||||
VOP1_SDST_LIT as R4_VOP1_SDST_LIT, VOP1_LIT as R4_VOP1_LIT,
|
||||
VOP2 as R4_VOP2, VOP2_LIT as R4_VOP2_LIT, VOP3 as R4_VOP3, VOP3_SDST as R4_VOP3_SDST, VOP3_SDST_LIT as R4_VOP3_SDST_LIT, VOP3_LIT as R4_VOP3_LIT,
|
||||
VOP3SD as R4_VOP3SD, VOP3SD_LIT as R4_VOP3SD_LIT, VOP3P as R4_VOP3P, VOP3P_LIT as R4_VOP3P_LIT, VOPC as R4_VOPC, VOPC_LIT as R4_VOPC_LIT,
|
||||
VOPD as R4_VOPD, VOPD_LIT as R4_VOPD_LIT, VINTERP as R4_VINTERP, SOP1 as R4_SOP1, SOP1_LIT as R4_SOP1_LIT, SOP2 as R4_SOP2, SOP2_LIT as R4_SOP2_LIT,
|
||||
SOPC as R4_SOPC, SOPC_LIT as R4_SOPC_LIT, SOPK as R4_SOPK, SOPK_LIT as R4_SOPK_LIT, SOPP as R4_SOPP, SMEM as R4_SMEM, DS as R4_DS,
|
||||
VOPDOp as R4_VOPDOp, HWREG as HWREG_RDNA4, VFLAT as R4_FLAT, VGLOBAL as R4_GLOBAL, VSCRATCH as R4_SCRATCH)
|
||||
from extra.assembly.amd.autogen.cdna.ins import HWREG as HWREG_CDNA
|
||||
from tinygrad.runtime.autogen.amd.cdna.ins import HWREG as HWREG_CDNA
|
||||
|
||||
def _is_cdna(inst: Inst) -> bool: return 'cdna' in inst.__class__.__module__
|
||||
def _is_r4(inst: Inst) -> bool: return 'rdna4' in inst.__class__.__module__
|
||||
|
|
@ -631,7 +631,7 @@ def disasm(inst: Inst) -> str: return DISASM_HANDLERS[type(inst)](inst)
|
|||
# CDNA DISASSEMBLER SUPPORT
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
|
||||
from extra.assembly.amd.autogen.cdna.ins import (VOP1 as CDNA_VOP1, VOP1_LIT as CDNA_VOP1_LIT,
|
||||
from tinygrad.runtime.autogen.amd.cdna.ins import (VOP1 as CDNA_VOP1, VOP1_LIT as CDNA_VOP1_LIT,
|
||||
VOP1_SDWA as CDNA_VOP1_SDWA, VOP1_DPP16 as CDNA_VOP1_DPP16,
|
||||
VOP2 as CDNA_VOP2, VOP2_LIT as CDNA_VOP2_LIT, VOP2_SDWA as CDNA_VOP2_SDWA, VOP2_DPP16 as CDNA_VOP2_DPP16,
|
||||
VOPC as CDNA_VOPC, VOPC_LIT as CDNA_VOPC_LIT, VOPC_SDWA_SDST as CDNA_VOPC_SDWA_SDST,
|
||||
|
|
@ -4,10 +4,10 @@ Uses run_asm() with memory output, so tests can run on both emulator and real ha
|
|||
Set USE_HW=1 to run on both emulator and hardware, comparing results.
|
||||
"""
|
||||
import ctypes, math, os, struct
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
|
||||
from extra.assembly.amd.emu import run_asm
|
||||
from extra.assembly.amd.dsl import NULL, SCC, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, M0
|
||||
from tinygrad.renderer.amd.emu import run_asm
|
||||
from tinygrad.renderer.amd.dsl import NULL, SCC, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, M0
|
||||
|
||||
def _i32(f: float) -> int: return struct.unpack('<I', struct.pack('<f', f))[0]
|
||||
def _f32(i: int) -> float: return struct.unpack('<f', struct.pack('<I', i & 0xFFFFFFFF))[0]
|
||||
|
|
@ -5,7 +5,7 @@ Includes: ds_store_b32, ds_load_b32, ds_store_2addr_*, ds_load_2addr_*,
|
|||
ds_inc_*, ds_dec_*, ds_cmpstore_*, ds_storexchg_*
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
class TestDS2Addr(unittest.TestCase):
|
||||
"""Tests for DS_*_2ADDR instructions."""
|
||||
|
|
@ -3,7 +3,7 @@
|
|||
Includes: flat_load_*, flat_store_*, flat_atomic_*
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
class TestFlatAtomic(unittest.TestCase):
|
||||
"""Tests for FLAT atomic instructions."""
|
||||
|
|
@ -3,7 +3,7 @@
|
|||
Includes: global_load_*, global_store_*, global_atomic_*, global_load_d16_*
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
class TestGlobalAtomic(unittest.TestCase):
|
||||
"""Tests for GLOBAL atomic instructions."""
|
||||
|
|
@ -3,7 +3,7 @@
|
|||
Includes: scratch_load_*, scratch_store_*
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
class TestScratchStore(unittest.TestCase):
|
||||
"""Tests for SCRATCH store instructions."""
|
||||
|
|
@ -4,7 +4,7 @@ Includes: s_load_b32, s_load_b64, s_load_b128, s_load_b256, s_load_b512
|
|||
Tests both immediate and register offset addressing modes.
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
# Use offset into output buffer for test data (output buffer is 2124 bytes)
|
||||
TEST_OFFSET = 2000
|
||||
|
|
@ -4,7 +4,7 @@ Includes: s_add_u32, s_mov_b32, s_and_b32, s_or_b32, s_quadmask_b32, s_wqm_b32,
|
|||
s_cbranch_vccnz, s_cbranch_vccz
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
class TestBasicScalar(unittest.TestCase):
|
||||
"""Tests for basic scalar operations."""
|
||||
|
|
@ -5,7 +5,7 @@ Includes: v_mov_b32, v_cvt_*, v_sin_f32, v_rcp_f32, v_exp_f32, v_rndne_f32,
|
|||
v_readfirstlane_b32
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
class TestMov(unittest.TestCase):
|
||||
"""Tests for V_MOV_B32."""
|
||||
|
|
@ -5,7 +5,7 @@ Includes: v_add_f32, v_mul_f32, v_and_b32, v_or_b32, v_xor_b32,
|
|||
v_add_nc_u32, v_cndmask_b32, v_add_f16, v_mul_f16
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
class TestBasicArithmetic(unittest.TestCase):
|
||||
"""Tests for basic arithmetic VOP2 instructions."""
|
||||
|
|
@ -4,7 +4,7 @@ Includes: v_fma_f32, v_div_scale_f32, v_div_fmas_f32, v_div_fixup_f32,
|
|||
v_alignbit_b32, v_bfe_i32, v_mad_u64_u32, v_readlane_b32, v_writelane_b32
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
class TestFMA(unittest.TestCase):
|
||||
"""Tests for FMA instructions."""
|
||||
|
|
@ -2754,7 +2754,7 @@ class TestVOP3VOPC(unittest.TestCase):
|
|||
|
||||
def test_v_cmp_ge_f32_e64_nan(self):
|
||||
"""V_CMP_GE_F32_E64: |NaN| >= |0.0| should be FALSE (NaN comparisons always false)."""
|
||||
from extra.assembly.amd.autogen.rdna3.ins import VOP3_SDST
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import VOP3_SDST
|
||||
instructions = [
|
||||
s_mov_b32(s[0], 0xffc00000), # NaN
|
||||
s_mov_b32(s[1], 0x00000000), # 0.0
|
||||
|
|
@ -3,7 +3,7 @@
|
|||
Includes: v_pk_add_f16, v_pk_mul_f16, v_pk_fma_f16, v_pack_b32_f16, v_wmma_*, v_dot2_*
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
class TestPackInstructions(unittest.TestCase):
|
||||
"""Tests for pack instructions."""
|
||||
|
|
@ -3,7 +3,7 @@
|
|||
Includes: v_cmp_class_f32, v_cmp_class_f16, v_cmp_eq_*, v_cmp_lt_*, v_cmp_gt_*
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import *
|
||||
from test.amd.hw.helpers import *
|
||||
|
||||
VCC = 106 # SGPR index for VCC_LO
|
||||
|
||||
|
|
@ -7,8 +7,8 @@ VOPD executes two operations simultaneously. Key behavior:
|
|||
- Op Y can use ops 0-18 (includes ADD_NC_U32, LSHLREV, AND)
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.test.hw.helpers import run_program, v, v_mov_b32_e32
|
||||
from extra.assembly.amd.autogen.rdna3.ins import VOPD, VOPD_LIT, VOPDOp
|
||||
from test.amd.hw.helpers import run_program, v, v_mov_b32_e32
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import VOPD, VOPD_LIT, VOPDOp
|
||||
|
||||
class TestVOPDBasic(unittest.TestCase):
|
||||
"""Basic VOPD functionality tests."""
|
||||
|
|
@ -108,7 +108,7 @@ class TestVOPDLiterals(unittest.TestCase):
|
|||
Tests that the 32-bit literal (SIMM32) is correctly passed to the instruction.
|
||||
fma(2.0, 3.0, 10.0) = 2*3 + 10 = 16.0
|
||||
"""
|
||||
from extra.assembly.amd.test.hw.helpers import f2i, i2f
|
||||
from test.amd.hw.helpers import f2i, i2f
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0
|
||||
v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0
|
||||
|
|
@ -126,7 +126,7 @@ class TestVOPDLiterals(unittest.TestCase):
|
|||
Tests that the 32-bit literal (SIMM32) is correctly used as the multiplier.
|
||||
fma(2.0, 5.0, 3.0) = 2*5 + 3 = 13.0
|
||||
"""
|
||||
from extra.assembly.amd.test.hw.helpers import f2i, i2f
|
||||
from test.amd.hw.helpers import f2i, i2f
|
||||
instructions = [
|
||||
v_mov_b32_e32(v[0], f2i(2.0)), # v[0] = 2.0
|
||||
v_mov_b32_e32(v[1], f2i(3.0)), # v[1] = 3.0
|
||||
|
|
@ -2,10 +2,10 @@
|
|||
import unittest, ctypes
|
||||
from dataclasses import dataclass
|
||||
|
||||
from extra.assembly.amd.emu import WaveState, decode_program, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
|
||||
from extra.assembly.amd import decode_inst
|
||||
from extra.assembly.amd.test.helpers import KernelInfo
|
||||
from extra.assembly.amd.test.bench_emu import REMU_PATH
|
||||
from tinygrad.renderer.amd.emu import WaveState, decode_program, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
|
||||
from tinygrad.renderer.amd import decode_inst
|
||||
from test.amd.helpers import KernelInfo
|
||||
from test.amd.bench_emu import REMU_PATH
|
||||
|
||||
def set_valid_mem_ranges(ranges): pass # emu2 doesn't need this
|
||||
|
||||
|
|
@ -5,9 +5,9 @@ from tinygrad.uop.ops import UOp, Ops, KernelInfo
|
|||
from tinygrad.renderer import Estimates
|
||||
from tinygrad.runtime.support.compiler_amd import HIPCompiler
|
||||
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from extra.assembly.amd.dsl import s, v, Inst
|
||||
from extra.assembly.amd.test.disasm import disasm as disasm_inst
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
from tinygrad.renderer.amd.dsl import s, v, Inst
|
||||
from test.amd.disasm import disasm as disasm_inst
|
||||
|
||||
def assemble_insts(insts:list[Inst], name:str, arch:str, kernarg_size:int=8) -> tuple[UOp, UOp]:
|
||||
kd = {"kernarg_size":kernarg_size, "user_sgpr_kernarg_segment_ptr":1, "next_free_vgpr":8, "next_free_sgpr":8, "wavefront_size32":1}
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
import unittest
|
||||
from extra.assembly.amd.dsl import *
|
||||
from extra.assembly.amd.dsl import VDSTYField
|
||||
from extra.assembly.amd.autogen.rdna3.enum import VOP1Op, VOP2Op
|
||||
from extra.assembly.amd.autogen.rdna3.ins import VOP1
|
||||
from tinygrad.renderer.amd.dsl import *
|
||||
from tinygrad.renderer.amd.dsl import VDSTYField
|
||||
from tinygrad.runtime.autogen.amd.rdna3.enum import VOP1Op, VOP2Op
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import VOP1
|
||||
|
||||
class TestRegisters(unittest.TestCase):
|
||||
def test_vgpr_single(self):
|
||||
|
|
@ -4,10 +4,10 @@ from collections import defaultdict
|
|||
from tinygrad.helpers import DEBUG
|
||||
from tinygrad.dtype import dtypes
|
||||
from tinygrad.uop.ops import UOp, Ops
|
||||
from extra.assembly.amd.emu import parse_pcode
|
||||
from extra.assembly.amd.pcode import parse_expr
|
||||
from extra.assembly.amd.autogen.rdna3.str_pcode import PCODE
|
||||
from extra.assembly.amd.autogen.rdna3.enum import VOP1Op, VOP2Op, SOP2Op, DSOp
|
||||
from tinygrad.renderer.amd.emu import parse_pcode
|
||||
from tinygrad.renderer.amd.pcode import parse_expr
|
||||
from tinygrad.runtime.autogen.amd.rdna3.str_pcode import PCODE
|
||||
from tinygrad.runtime.autogen.amd.rdna3.enum import VOP1Op, VOP2Op, SOP2Op, DSOp
|
||||
|
||||
def _srcs():
|
||||
"""Create minimal source variables for pcode parsing."""
|
||||
|
|
@ -325,15 +325,15 @@ class TestAllPcode(unittest.TestCase):
|
|||
self.assertGreaterEqual(pct, min_pct, f"[{arch}] {pct:.1f}% < {min_pct}% threshold")
|
||||
|
||||
def test_parse_all_cdna_pcode(self):
|
||||
from extra.assembly.amd.autogen.cdna.str_pcode import PCODE as CDNA_PCODE
|
||||
from tinygrad.runtime.autogen.amd.cdna.str_pcode import PCODE as CDNA_PCODE
|
||||
self._parse_all_pcode(CDNA_PCODE, "CDNA", min_pct=60)
|
||||
|
||||
def test_parse_all_rdna3_pcode(self):
|
||||
from extra.assembly.amd.autogen.rdna3.str_pcode import PCODE as RDNA3_PCODE
|
||||
from tinygrad.runtime.autogen.amd.rdna3.str_pcode import PCODE as RDNA3_PCODE
|
||||
self._parse_all_pcode(RDNA3_PCODE, "RDNA3", min_pct=90)
|
||||
|
||||
def test_parse_all_rdna4_pcode(self):
|
||||
from extra.assembly.amd.autogen.rdna4.str_pcode import PCODE as RDNA4_PCODE
|
||||
from tinygrad.runtime.autogen.amd.rdna4.str_pcode import PCODE as RDNA4_PCODE
|
||||
self._parse_all_pcode(RDNA4_PCODE, "RDNA4", min_pct=65)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
@ -4,10 +4,10 @@
|
|||
Note: Graphics-only formats (EXP, MUBUF, MTBUF, MIMG) are not supported - use GLOBAL/FLAT for memory access in compute.
|
||||
"""
|
||||
import unittest
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from extra.assembly.amd.dsl import VCC_HI, EXEC_LO, NULL
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
from tinygrad.renderer.amd.dsl import VCC_HI, EXEC_LO, NULL
|
||||
OFF = NULL # OFF is alias for NULL
|
||||
from extra.assembly.amd import detect_format
|
||||
from tinygrad.renderer.amd import detect_format
|
||||
|
||||
|
||||
class TestDS(unittest.TestCase):
|
||||
|
|
@ -2,10 +2,10 @@
|
|||
# the Inst constructor should be looking at the types of the fields to correctly set the value
|
||||
|
||||
import unittest, struct
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from extra.assembly.amd.dsl import Inst
|
||||
from extra.assembly.amd.test.test_roundtrip import compile_asm
|
||||
from extra.assembly.amd.test.disasm import disasm
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
from tinygrad.renderer.amd.dsl import Inst
|
||||
from test.amd.test_roundtrip import compile_asm
|
||||
from test.amd.disasm import disasm
|
||||
|
||||
class IntegrationTestBase(unittest.TestCase):
|
||||
inst: Inst
|
||||
|
|
@ -133,11 +133,11 @@ class TestIntegrationCDNA(IntegrationTestBase):
|
|||
arch = "cdna"
|
||||
|
||||
def test_mfma(self):
|
||||
from extra.assembly.amd.autogen.cdna.ins import v_mfma_f32_16x16x16_f16
|
||||
from tinygrad.runtime.autogen.amd.cdna.ins import v_mfma_f32_16x16x16_f16
|
||||
self.inst = v_mfma_f32_16x16x16_f16(v[0:3], v[0:1], v[0:1], 0)
|
||||
|
||||
def test_mfma_fp8(self):
|
||||
from extra.assembly.amd.autogen.cdna.ins import v_mfma_f32_16x16x128_f8f6f4
|
||||
from tinygrad.runtime.autogen.amd.cdna.ins import v_mfma_f32_16x16x128_f8f6f4
|
||||
self.inst = v_mfma_f32_16x16x128_f8f6f4(v[0:3], v[0:5], v[0:5], 1, cbsz=2, blgp=2)
|
||||
|
||||
class TestRegisterSliceSyntax(unittest.TestCase):
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Integration test: round-trip RDNA3 assembly through AMD toolchain."""
|
||||
import unittest, io, sys
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
|
||||
def waitcnt(vmcnt: int = 0x3f, expcnt: int = 0x7, lgkmcnt: int = 0x3f) -> int:
  """Pack the three wait counters into a single integer.

  Each counter is truncated to its field width before packing:
  expcnt occupies bits 0-2, lgkmcnt bits 4-9 and vmcnt bits 10-15.
  """
  exp_bits = expcnt & 0x7
  lgkm_bits = (lgkmcnt & 0x3f) << 4
  vm_bits = (vmcnt & 0x3f) << 10
  return exp_bits | lgkm_bits | vm_bits
|
||||
|
|
@ -10,9 +10,9 @@ Only compute-relevant instruction formats are tested. Graphics-only formats not
|
|||
"""
|
||||
import unittest, re, subprocess, functools
|
||||
from tinygrad.helpers import fetch
|
||||
from extra.assembly.amd.test.disasm import disasm
|
||||
from extra.assembly.amd import decode_inst, detect_format
|
||||
from extra.assembly.amd.test.helpers import get_llvm_mc, get_target, get_mattr
|
||||
from test.amd.disasm import disasm
|
||||
from tinygrad.renderer.amd import decode_inst, detect_format
|
||||
from test.amd.helpers import get_llvm_mc, get_target, get_mattr
|
||||
|
||||
LLVM_BASE = "https://raw.githubusercontent.com/llvm/llvm-project/llvmorg-21.1.0/llvm/test/MC/AMDGPU"
|
||||
|
||||
|
|
@ -127,9 +127,9 @@ def _make_test(f: str, arch: str, test_type: str):
|
|||
self.assertEqual(skipped, 0, f"{name}: {skipped} tests skipped, expected 0")
|
||||
elif test_type == "repr":
|
||||
# Test that eval(repr(inst)) reproduces the instruction
|
||||
if arch == "rdna3": import extra.assembly.amd.autogen.rdna3.ins as ins # type: ignore[no-redef]
|
||||
elif arch == "rdna4": import extra.assembly.amd.autogen.rdna4.ins as ins # type: ignore[no-redef]
|
||||
elif arch == "cdna": import extra.assembly.amd.autogen.cdna.ins as ins # type: ignore[no-redef]
|
||||
if arch == "rdna3": import tinygrad.runtime.autogen.amd.rdna3.ins as ins # type: ignore[no-redef]
|
||||
elif arch == "rdna4": import tinygrad.runtime.autogen.amd.rdna4.ins as ins # type: ignore[no-redef]
|
||||
elif arch == "cdna": import tinygrad.runtime.autogen.amd.cdna.ins as ins # type: ignore[no-redef]
|
||||
ns = {k: getattr(ins, k) for k in dir(ins) if not k.startswith('_')}
|
||||
passed, skipped = 0, 0
|
||||
for _, data in tests:
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Test PDF pseudocode extraction from generate.py."""
|
||||
import unittest
|
||||
from extra.assembly.amd.generate import extract_pdf_text, extract_pcode, parse_xml, ARCHS, FIXES
|
||||
from tinygrad.renderer.amd.generate import extract_pdf_text, extract_pcode, parse_xml, ARCHS, FIXES
|
||||
|
||||
EXPECTED_PAGES = {"rdna3": 655, "rdna4": 711, "cdna": 610}
|
||||
|
||||
|
|
@ -1,8 +1,8 @@
|
|||
#!/usr/bin/env python3
|
||||
import unittest, subprocess
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from extra.assembly.amd.test.helpers import get_llvm_mc
|
||||
from extra.assembly.amd.test.disasm import disasm
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
from test.amd.helpers import get_llvm_mc
|
||||
from test.amd.disasm import disasm
|
||||
|
||||
def llvm_assemble(asm: str) -> bytes:
|
||||
"""Assemble using llvm-mc and return bytes."""
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
import unittest, ctypes
|
||||
from extra.assembly.amd.autogen.rdna4 import ins as ir4
|
||||
from extra.assembly.amd.dsl import v, s
|
||||
from extra.assembly.amd.emu import WaveState, decode_program
|
||||
from tinygrad.runtime.autogen.amd.rdna4 import ins as ir4
|
||||
from tinygrad.renderer.amd.dsl import v, s
|
||||
from tinygrad.renderer.amd.emu import WaveState, decode_program
|
||||
from tinygrad.device import Buffer, BufferSpec
|
||||
from tinygrad.dtype import dtypes
|
||||
|
||||
|
|
@ -1,9 +1,9 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Roundtrip tests: generate tinygrad kernels, decode instructions, re-encode, verify match."""
|
||||
import unittest, io, sys, re, subprocess, os
|
||||
from extra.assembly.amd import detect_format
|
||||
from extra.assembly.amd.test.helpers import get_llvm_mc, get_llvm_objdump, get_target, get_mattr
|
||||
from extra.assembly.amd.test.disasm import disasm
|
||||
from tinygrad.renderer.amd import detect_format
|
||||
from test.amd.helpers import get_llvm_mc, get_llvm_objdump, get_target, get_mattr
|
||||
from test.amd.disasm import disasm
|
||||
|
||||
def disassemble_lib(lib: bytes, compiler) -> list[tuple[str, bytes]]:
|
||||
"""Disassemble ELF binary and return list of (instruction_text, machine_code_bytes)."""
|
||||
|
|
@ -82,7 +82,7 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
|
|||
"""
|
||||
arch = self.arch
|
||||
|
||||
from extra.assembly.amd.test.test_compare_emulators import get_kernels_from_tinygrad
|
||||
from test.amd.test_compare_emulators import get_kernels_from_tinygrad
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
|
||||
from tinygrad.helpers import AMD_LLVM
|
||||
|
|
@ -5,15 +5,16 @@ from pathlib import Path
|
|||
from tinygrad.helpers import DEBUG
|
||||
from tinygrad.runtime.autogen import rocprof
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from extra.assembly.amd import decode_inst
|
||||
from extra.assembly.amd.autogen.rdna3.ins import SOPP
|
||||
from extra.assembly.amd.autogen.rdna3.enum import SOPPOp
|
||||
from extra.assembly.amd.sqtt import (decode, LAYOUT_HEADER, WAVESTART, WAVESTART_RDNA4, WAVEEND, INST, INST_RDNA4, VALUINST,
|
||||
from tinygrad.renderer.amd import decode_inst
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import SOPP
|
||||
from tinygrad.runtime.autogen.amd.rdna3.enum import SOPPOp
|
||||
from tinygrad.renderer.amd.sqtt import (decode, LAYOUT_HEADER, WAVESTART, WAVESTART_RDNA4, WAVEEND, INST, INST_RDNA4, VALUINST,
|
||||
IMMEDIATE, IMMEDIATE_MASK, PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4,
|
||||
InstOp, InstOpRDNA4, print_packets)
|
||||
from extra.assembly.amd.test.helpers import TARGET_TO_ARCH
|
||||
from test.amd.helpers import TARGET_TO_ARCH
|
||||
|
||||
EXAMPLES_DIR = Path(__file__).parent.parent.parent.parent / "sqtt/examples"
|
||||
import tinygrad
|
||||
EXAMPLES_DIR = Path(tinygrad.__file__).parent.parent / "extra/sqtt/examples"
|
||||
# INST ops for non-traced SIMDs (excluded from instruction count)
|
||||
OTHER_SIMD_OPS = {InstOp.OTHER_LDS_LOAD, InstOp.OTHER_LDS_STORE, InstOp.OTHER_LDS_STORE_64, InstOp.OTHER_LDS_STORE_128,
|
||||
InstOp.OTHER_FLAT_LOAD, InstOp.OTHER_FLAT_STORE, InstOp.OTHER_FLAT_STORE_64, InstOp.OTHER_FLAT_STORE_96,
|
||||
|
|
@ -3,7 +3,8 @@ import unittest, struct, ctypes, pickle
|
|||
from pathlib import Path
|
||||
|
||||
ROCPROF_LIB = Path("/usr/lib/librocprof-trace-decoder.so")
|
||||
EXAMPLES_DIR = Path(__file__).parent.parent.parent.parent / "sqtt/examples"
|
||||
import tinygrad
|
||||
EXAMPLES_DIR = Path(tinygrad.__file__).parent.parent / "extra/sqtt/examples"
|
||||
|
||||
# CDNA pkt_fmt -> size in bytes (extracted from rocprof hash table)
|
||||
CDNA_PKT_SIZES = {0: 2, 1: 8, 2: 8, 3: 4, 4: 2, 5: 6, 6: 2, 7: 2, 8: 2, 9: 2, 10: 2, 11: 8, 12: 6, 13: 4, 14: 8, 15: 6}
|
||||
|
|
@ -79,7 +80,7 @@ def extract_packet_encodings():
|
|||
|
||||
def extract_cdna_packet_sizes():
|
||||
"""Extract CDNA pkt_fmt -> size mapping by running rocprof decoder to populate its hash table."""
|
||||
from extra.assembly.amd.test.test_sqtt_examples import run_rocprof_decoder
|
||||
from test.amd.test_sqtt_examples import run_rocprof_decoder
|
||||
|
||||
if not (pkl_path := next((EXAMPLES_DIR / "gfx950").glob("*.pkl"), None)): return None
|
||||
with open(pkl_path, "rb") as f: data = pickle.load(f)
|
||||
|
|
@ -124,7 +125,7 @@ class TestSQTTMatchesBinary(unittest.TestCase):
|
|||
with self.subTest(pkt_fmt=pkt_fmt): self.assertEqual(pkt_sizes.get(pkt_fmt), size)
|
||||
|
||||
def test_cdna_packet_definitions(self):
|
||||
from extra.assembly.amd.sqtt import PACKET_TYPES_CDNA
|
||||
from tinygrad.renderer.amd.sqtt import PACKET_TYPES_CDNA
|
||||
for pkt_fmt, pkt_cls in PACKET_TYPES_CDNA.items():
|
||||
with self.subTest(packet=pkt_cls.__name__):
|
||||
self.assertEqual(pkt_cls.encoding.default, pkt_fmt)
|
||||
|
|
@ -132,21 +133,21 @@ class TestSQTTMatchesBinary(unittest.TestCase):
|
|||
|
||||
def _test_bit_counts(self, layout: int):
|
||||
if not (tables := extract_bit_tables()): self.skipTest("rocprof-trace-decoder not installed")
|
||||
from extra.assembly.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
|
||||
from tinygrad.renderer.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
|
||||
for type_id, pkt_cls in {3: PACKET_TYPES_RDNA3, 4: PACKET_TYPES_RDNA4}[layout].items():
|
||||
with self.subTest(packet=pkt_cls.__name__):
|
||||
self.assertEqual(pkt_cls._size_nibbles * 4, tables[layout - 2][type_id]) # type: ignore[attr-defined]
|
||||
|
||||
def _test_encodings(self, layout: int):
|
||||
if not (encodings := extract_packet_encodings()): self.skipTest("rocprof-trace-decoder not installed")
|
||||
from extra.assembly.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
|
||||
from tinygrad.renderer.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
|
||||
for type_id, pkt_cls in {3: PACKET_TYPES_RDNA3, 4: PACKET_TYPES_RDNA4}[layout].items():
|
||||
with self.subTest(packet=pkt_cls.__name__):
|
||||
self.assertEqual((pkt_cls.encoding.mask, pkt_cls.encoding.default), encodings[layout - 2][type_id])
|
||||
|
||||
def _test_delta_fields(self, layout: int):
|
||||
if not (deltas := extract_delta_fields()): self.skipTest("rocprof-trace-decoder not installed")
|
||||
from extra.assembly.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
|
||||
from tinygrad.renderer.amd.sqtt import PACKET_TYPES_RDNA3, PACKET_TYPES_RDNA4
|
||||
for type_id, pkt_cls in {3: PACKET_TYPES_RDNA3, 4: PACKET_TYPES_RDNA4}[layout].items():
|
||||
if type_id not in deltas[layout - 2]: continue
|
||||
delta = getattr(pkt_cls, 'delta', None)
|
||||
|
|
@ -3,11 +3,12 @@ import unittest, pickle
|
|||
from typing import Iterator
|
||||
from pathlib import Path
|
||||
from tinygrad.helpers import DEBUG
|
||||
from extra.assembly.amd.sqtt import print_packets, map_insts
|
||||
from extra.assembly.amd.autogen.rdna3.ins import s_endpgm
|
||||
from extra.assembly.amd.test.disasm import disasm
|
||||
from tinygrad.renderer.amd.sqtt import print_packets, map_insts
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import s_endpgm
|
||||
from test.amd.disasm import disasm
|
||||
|
||||
EXAMPLES_DIR = Path(__file__).parent.parent.parent.parent / "sqtt/examples"
|
||||
import tinygrad
|
||||
EXAMPLES_DIR = Path(tinygrad.__file__).parent.parent / "extra/sqtt/examples"
|
||||
|
||||
def rocprof_inst_traces_match(sqtt, prg, target):
|
||||
from tinygrad.viz.serve import amd_decode
|
||||
|
|
@ -1,11 +1,11 @@
|
|||
# ruff: noqa: F405
|
||||
import unittest, subprocess, os
|
||||
from extra.assembly.amd.autogen.rdna3.ins import * # noqa: F403
|
||||
from extra.assembly.amd.dsl import s, v, Inst, NULL
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import * # noqa: F403
|
||||
from tinygrad.renderer.amd.dsl import s, v, Inst, NULL
|
||||
|
||||
def assemble_kernel(insts:list[Inst], name:str="test") -> str:
|
||||
kd = {"next_free_vgpr": 8, "next_free_sgpr": 8, "wavefront_size32": 1, "user_sgpr_kernarg_segment_ptr": 1, "kernarg_size": 8}
|
||||
from extra.assembly.amd.test.disasm import disasm as _disasm
|
||||
from test.amd.disasm import disasm as _disasm
|
||||
disasm = "\n".join(_disasm(inst) for inst in insts)
|
||||
hsasrc = f".text\n.globl {name}\n.p2align 8\n.type {name},@function\n{name}:\n{disasm}\n"
|
||||
return hsasrc + f".rodata\n.p2align 6\n.amdhsa_kernel {name}\n" + "\n".join(f".amdhsa_{k} {v}" for k, v in kd.items()) + "\n.end_amdhsa_kernel"
|
||||
|
|
|
|||
6
test/external/external_test_gpu_crash.py
vendored
6
test/external/external_test_gpu_crash.py
vendored
|
|
@ -6,8 +6,8 @@ Run with: AMD=1 python -m pytest test/external/external_test_gpu_crash.py -v
|
|||
"""
|
||||
import unittest, re
|
||||
from tinygrad.device import Device
|
||||
from extra.assembly.amd.autogen.rdna3.ins import * # noqa: F403
|
||||
from extra.assembly.amd.dsl import s, v, Inst, NULL
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import * # noqa: F403
|
||||
from tinygrad.renderer.amd.dsl import s, v, Inst, NULL
|
||||
|
||||
def assemble(code:str, name:str="test") -> str:
|
||||
kd = {"next_free_vgpr": 8, "next_free_sgpr": 8, "wavefront_size32": 1, "user_sgpr_kernarg_segment_ptr": 1, "kernarg_size": 8}
|
||||
|
|
@ -37,7 +37,7 @@ class TestGPUCrash(unittest.TestCase):
|
|||
prg(self.dev.allocator.alloc(64), global_size=(1,1,1), local_size=(1,1,1), wait=True)
|
||||
|
||||
def _run_insts(self, insts: list[Inst]):
|
||||
from extra.assembly.amd.test.disasm import disasm
|
||||
from test.amd.disasm import disasm
|
||||
self._run("\n".join(disasm(i) for i in insts))
|
||||
|
||||
def _assert_gpu_fault(self, func):
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ class PythonRemu:
|
|||
arch: str = "rdna3" # Architecture: rdna3 or rdna4
|
||||
|
||||
def run_asm(self, lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int:
|
||||
from extra.assembly.amd.emu import run_asm
|
||||
from tinygrad.renderer.amd.emu import run_asm
|
||||
return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr, self.rsrc2, self.scratch_size, self.arch)
|
||||
|
||||
def _try_dlopen_remu():
|
||||
|
|
|
|||
|
|
@ -10,8 +10,8 @@ from tinygrad.device import Compiler
|
|||
from tinygrad.runtime.support.compiler_amd import HIPCompiler
|
||||
from tinygrad.viz.serve import amdgpu_cfg
|
||||
|
||||
from extra.assembly.amd.autogen.rdna3.ins import *
|
||||
from extra.assembly.amd.dsl import s
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
from tinygrad.renderer.amd.dsl import s
|
||||
|
||||
# TODO: this belongs to the dsl infrastructure
|
||||
from extra.gemm.amd_asm_matmul import Kernel
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
An integrated environment for AMD GPU assembly and emulation
|
||||
|
||||
Test with `PYTHONPATH="." pytest -n12 extra/assembly/amd/`
|
||||
`AMD_LLVM=1 PYTHONPATH="." pytest -n12 extra/assembly/amd/`
|
||||
Test with `pytest -n12 test/amd/`
|
||||
`AMD_LLVM=1 pytest -n12 test/amd/`
|
||||
|
||||
* dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
|
||||
* emu.py -- an emulator for RDNA that runs in tinygrad with `AMD=1 MOCKGPU=1 PYTHON_REMU=1`
|
||||
|
|
@ -11,7 +11,7 @@ Test with `PYTHONPATH="." pytest -n12 extra/assembly/amd/`
|
|||
|
||||
The code should be as readable and deduplicated as possible. asm and emu shouldn't be required for dsl.
|
||||
|
||||
The autogen folder is autogenerated from the AMD PDFs with `python3 -m extra.assembly.amd.pdf --arch all`
|
||||
The autogen folder is autogenerated from the AMD PDFs with `python3 -m tinygrad.renderer.amd.pdf --arch all`
|
||||
|
||||
test_emu.py has a good set of instruction tests for the emulation, with USE_HW=1 it will compare to real hardware.
|
||||
Whenever an instruction is fixed, regression tests should be added here and confirmed with real hardware.
|
||||
|
|
@ -20,20 +20,20 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same
|
|||
|
||||
tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.
|
||||
|
||||
`PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
|
||||
The ops tests also pass, but they are very slow, so you should run them one at a time.
|
||||
|
||||
`SKIP_SLOW_TEST=1 PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 PYTHONPATH="." AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=0 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 AMD=1 PYTHON_REMU=1 MOCKGPU=1 AMD_LLVM=1 pytest -n=12 test/backend/test_ops.py`
|
||||
|
||||
When something is caught by main tinygrad tests, a local regression test should be added to `extra/assembly/amd/test`.
|
||||
When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
|
||||
While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware
|
||||
If a test is failing with `AMD=1 PYTHON_REMU=1 MOCKGPU=1` it's because an instruction is emulated incorrectly.
|
||||
You can test without `MOCKGPU=1` to test on real hardware, if it works on real hardware there's a bug in the emulator.
|
||||
IMPORTANT: if a test is failing in the emulator, it's an instruction bug. Use DEBUG=7, get the instructions, and debug.
|
||||
|
||||
Currently, only RDNA3 is well supported, but when finished, this will support RDNA3+RDNA4+CDNA in ~3000 lines.
|
||||
Get line count with `cloc --by-file extra/assembly/amd/*.py`
|
||||
Get line count with `cloc --by-file tinygrad/renderer/amd/*.py`
|
||||
|
||||
72
tinygrad/renderer/amd/__init__.py
Normal file
72
tinygrad/renderer/amd/__init__.py
Normal file
|
|
@ -0,0 +1,72 @@
|
|||
# Instruction format detection and decoding
|
||||
from __future__ import annotations
|
||||
from tinygrad.renderer.amd.dsl import Inst, FixedBitField, EnumBitField
|
||||
|
||||
# SDWA/DPP variant detection: src0 field (bits 0-8) encodes the variant
|
||||
# 0xf9 (249) = SDWA, 0xfa (250) = DPP16 for CDNA (GFX9)
|
||||
_VARIANT_SRC0 = {"_SDWA_SDST": 0xf9, "_SDWA": 0xf9, "_DPP16": 0xfa}
|
||||
|
||||
def _matches(data: bytes, cls: type[Inst]) -> bool:
  """Decide whether `data` can be an encoding of the instruction class `cls`.

  Every field of `cls` must lie inside `data`; each FixedBitField must hold its
  default value, and each EnumBitField with an `allowed` set must decode to a
  member of that set. Classes whose name ends in a `_VARIANT_SRC0` suffix must
  additionally carry the matching value in the low 9 bits of the first dword.
  """
  for _, fld in cls._fields:
    dword, shift = divmod(fld.lo, 32)
    end = (dword + 1) * 4
    # the dword holding this field has to be fully present
    if len(data) < end: return False
    shifted = int.from_bytes(data[dword*4:end], 'little') >> shift
    if isinstance(fld, FixedBitField) and (shifted & fld.mask) != fld.default: return False
    if isinstance(fld, EnumBitField) and fld.allowed is not None:
      try: op = fld.decode(shifted & fld.mask)
      except ValueError: return False  # raw value is not a member of the enum
      if op not in fld.allowed: return False
  # SDWA/DPP variant classes are told apart by the src0 field (bits 0-8); the first matching suffix decides
  src0 = int.from_bytes(data[:4], 'little') & 0x1ff
  cls_name = cls.__name__
  return next((src0 == want for suffix, want in _VARIANT_SRC0.items() if cls_name.endswith(suffix)), True)
|
||||
|
||||
# Lazy-load instruction format tables to avoid circular imports (ins.py imports dsl.py which is in this package)
|
||||
_FORMATS: dict[str, list[type[Inst]]] | None = None
|
||||
|
||||
def _load_formats() -> dict[str, list[type[Inst]]]:
  """Return the per-architecture lists of instruction format classes, building them on first use.

  The instruction classes are imported inside the function rather than at module
  import time, and the resulting table is stored in the module-level `_FORMATS`
  so later calls return the same dict. Keys are "rdna3", "rdna4" and "cdna";
  each value is ordered so that more specific encodings come before catch-alls.
  """
  global _FORMATS
  if _FORMATS is not None: return _FORMATS
  from tinygrad.runtime.autogen.amd.rdna3.ins import (VOP1, VOP1_SDST, VOP1_LIT, VOP2, VOP2_LIT, VOP3, VOP3_SDST, VOP3SD, VOP3P, VOPC, VOPD,
    VINTERP, SOP1, SOP1_LIT, SOP2, SOP2_LIT, SOPC, SOPK, SOPK_LIT, SOPP, SMEM, DS, FLAT, GLOBAL, SCRATCH)
  # RDNA4 classes are aliased with an R4_ prefix (VFLAT/VGLOBAL/VSCRATCH become R4_FLAT/R4_GLOBAL/R4_SCRATCH)
  from tinygrad.runtime.autogen.amd.rdna4.ins import (VOP1 as R4_VOP1, VOP1_SDST as R4_VOP1_SDST, VOP1_LIT as R4_VOP1_LIT,
    VOP2 as R4_VOP2, VOP2_LIT as R4_VOP2_LIT, VOP3 as R4_VOP3, VOP3_SDST as R4_VOP3_SDST, VOP3SD as R4_VOP3SD, VOP3P as R4_VOP3P,
    VOPC as R4_VOPC, VOPD as R4_VOPD, VINTERP as R4_VINTERP, SOP1 as R4_SOP1, SOP1_LIT as R4_SOP1_LIT,
    SOP2 as R4_SOP2, SOP2_LIT as R4_SOP2_LIT, SOPC as R4_SOPC, SOPC_LIT as R4_SOPC_LIT,
    SOPK as R4_SOPK, SOPK_LIT as R4_SOPK_LIT, SOPP as R4_SOPP,
    SMEM as R4_SMEM, DS as R4_DS, VFLAT as R4_FLAT, VGLOBAL as R4_GLOBAL, VSCRATCH as R4_SCRATCH)
  # CDNA classes are aliased with a C_ prefix
  from tinygrad.runtime.autogen.amd.cdna.ins import (VOP1 as C_VOP1, VOP1_SDWA as C_VOP1_SDWA, VOP1_DPP16 as C_VOP1_DPP16,
    VOP2 as C_VOP2, VOP2_LIT as C_VOP2_LIT, VOP2_SDWA as C_VOP2_SDWA, VOP2_DPP16 as C_VOP2_DPP16,
    VOPC as C_VOPC, VOPC_SDWA_SDST as C_VOPC_SDWA_SDST,
    VOP3 as C_VOP3, VOP3_SDST as C_VOP3_SDST, VOP3SD as C_VOP3SD, VOP3P as C_VOP3P, VOP3P_MFMA as C_VOP3P_MFMA, VOP3PX2 as C_VOP3PX2,
    SOP1 as C_SOP1, SOP2 as C_SOP2, SOPC as C_SOPC, SOPK as C_SOPK, SOPK_LIT as C_SOPK_LIT, SOPP as C_SOPP, SMEM as C_SMEM, DS as C_DS,
    FLAT as C_FLAT, GLOBAL as C_GLOBAL, SCRATCH as C_SCRATCH, MUBUF as C_MUBUF)
  # Order matters: more specific encodings first, catch-alls (SOP2, VOP2) last
  # Order: base before _LIT (base matches regular ops, _LIT catches lit-only ops excluded from base)
  _FORMATS = {
    "rdna3": [VOPD, VOP3P, VINTERP, VOP3SD, VOP3_SDST, VOP3, DS, GLOBAL, SCRATCH, FLAT, SMEM,
              SOP1, SOP1_LIT, SOP2, SOP2_LIT, SOPC, SOPK, SOPK_LIT, SOPP, VOPC, VOP1_SDST, VOP1, VOP1_LIT, VOP2, VOP2_LIT],
    "rdna4": [R4_VOPD, R4_VOP3P, R4_VINTERP, R4_VOP3SD, R4_VOP3_SDST, R4_VOP3, R4_DS, R4_GLOBAL, R4_SCRATCH, R4_FLAT, R4_SMEM,
              R4_SOP1, R4_SOP1_LIT, R4_SOPC, R4_SOPC_LIT, R4_SOPP, R4_SOPK, R4_SOPK_LIT, R4_VOPC, R4_VOP1_SDST, R4_VOP1, R4_VOP1_LIT,
              R4_SOP2, R4_SOP2_LIT, R4_VOP2, R4_VOP2_LIT],
    "cdna": [C_VOP3PX2, C_VOP3P_MFMA, C_VOP3P, C_VOP3SD, C_VOP3_SDST, C_VOP3, C_DS, C_GLOBAL, C_SCRATCH, C_FLAT, C_MUBUF, C_SMEM,
             C_SOP1, C_SOPC, C_SOPP, C_SOPK, C_SOPK_LIT, C_VOPC_SDWA_SDST, C_VOPC,
             C_VOP1_DPP16, C_VOP1_SDWA, C_VOP1, C_VOP2_DPP16, C_VOP2_SDWA, C_SOP2, C_VOP2, C_VOP2_LIT],
  }
  return _FORMATS
|
||||
|
||||
def detect_format(data: bytes, arch: str = "rdna3") -> type[Inst]:
  """Return the first instruction format class of `arch` whose encoding matches `data`.

  Candidates are tried in the order given by `_load_formats()[arch]`.
  Raises ValueError when no candidate matches; `data` must hold at least 4 bytes.
  """
  assert len(data) >= 4, f"need at least 4 bytes, got {len(data)}"
  found = next((fmt for fmt in _load_formats()[arch] if _matches(data, fmt)), None)
  if found is None: raise ValueError(f"unknown {arch} format word={int.from_bytes(data[:4], 'little'):#010x}")
  return found
|
||||
|
||||
def decode_inst(data: bytes, arch: str = "rdna3") -> Inst:
  """Detect the format of `data` for `arch` and build the instruction from the same bytes."""
  fmt = detect_format(data, arch)
  return fmt.from_bytes(data)
|
||||
|
|
@ -236,9 +236,9 @@ class VDSTYField(BitField):
|
|||
# ══════════════════════════════════════════════════════════════
|
||||
|
||||
import functools
|
||||
from extra.assembly.amd.autogen.rdna3.operands import OPERANDS as OPERANDS_RDNA3
|
||||
from extra.assembly.amd.autogen.rdna4.operands import OPERANDS as OPERANDS_RDNA4
|
||||
from extra.assembly.amd.autogen.cdna.operands import OPERANDS as OPERANDS_CDNA
|
||||
from tinygrad.runtime.autogen.amd.rdna3.operands import OPERANDS as OPERANDS_RDNA3
|
||||
from tinygrad.runtime.autogen.amd.rdna4.operands import OPERANDS as OPERANDS_RDNA4
|
||||
from tinygrad.runtime.autogen.amd.cdna.operands import OPERANDS as OPERANDS_CDNA
|
||||
OPERANDS = {**OPERANDS_CDNA, **OPERANDS_RDNA3, **OPERANDS_RDNA4}
|
||||
|
||||
# ══════════════════════════════════════════════════════════════
|
||||
|
|
@ -48,14 +48,14 @@ from tinygrad.runtime.autogen import hsa
|
|||
from tinygrad.helpers import Context, DEBUG, colored
|
||||
from tinygrad.engine.realize import get_runner
|
||||
|
||||
from extra.assembly.amd import decode_inst
|
||||
from extra.assembly.amd.autogen.rdna3.str_pcode import PCODE as PCODE_RDNA3
|
||||
from extra.assembly.amd.autogen.rdna4.str_pcode import PCODE as PCODE_RDNA4
|
||||
from extra.assembly.amd.autogen.rdna3 import ins as ir3
|
||||
from extra.assembly.amd.autogen.rdna4 import ins as ir4
|
||||
from extra.assembly.amd.dsl import VCC_LO, EXEC_LO, SCC, ttmp
|
||||
from extra.assembly.amd.autogen.common import Fmt, OpType
|
||||
from extra.assembly.amd.pcode import parse_block, _FUNCS
|
||||
from tinygrad.renderer.amd import decode_inst
|
||||
from tinygrad.runtime.autogen.amd.rdna3.str_pcode import PCODE as PCODE_RDNA3
|
||||
from tinygrad.runtime.autogen.amd.rdna4.str_pcode import PCODE as PCODE_RDNA4
|
||||
from tinygrad.runtime.autogen.amd.rdna3 import ins as ir3
|
||||
from tinygrad.runtime.autogen.amd.rdna4 import ins as ir4
|
||||
from tinygrad.renderer.amd.dsl import VCC_LO, EXEC_LO, SCC, ttmp
|
||||
from tinygrad.runtime.autogen.amd.common import Fmt, OpType
|
||||
from tinygrad.renderer.amd.pcode import parse_block, _FUNCS
|
||||
|
||||
MASK32 = 0xFFFFFFFF
|
||||
|
||||
|
|
@ -265,7 +265,7 @@ def write_common(all_fmts: dict[str, int], all_op_types: set[str], path: pathlib
|
|||
|
||||
def write_enum(enums, path):
|
||||
lines: list[str] = ["# autogenerated from AMD ISA XML - do not edit",
|
||||
"from extra.assembly.amd.autogen.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401", ""]
|
||||
"from tinygrad.runtime.autogen.amd.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401", ""]
|
||||
for name, ops in sorted(enums.items()):
|
||||
if not ops: continue
|
||||
suffix = "_E32" if name in ("VOP1", "VOP2", "VOPC") else "_E64" if name == "VOP3" else ""
|
||||
|
|
@ -344,8 +344,8 @@ def write_ins(encodings, enums, suffix_only_ops, types, arch, path):
|
|||
"SCC", "VCCZ", "EXECZ", "ttmp", "INV_2PI", "SDWA", "DPP", "DPP16", "LIT", "SRC_LDS_DIRECT"]
|
||||
dsl_reexport = sorted(set(dsl_names + _DSL_REGS))
|
||||
lines: list[str] = ["# autogenerated from AMD ISA XML - do not edit", "# ruff: noqa: E501,F401",
|
||||
f"from extra.assembly.amd.dsl import {', '.join(dsl_reexport)}",
|
||||
f"from extra.assembly.amd.autogen.{arch}.enum import {', '.join(enum_names)}", "import functools", ""]
|
||||
f"from tinygrad.renderer.amd.dsl import {', '.join(dsl_reexport)}",
|
||||
f"from tinygrad.runtime.autogen.amd.{arch}.enum import {', '.join(enum_names)}", "import functools", ""]
|
||||
|
||||
def fmt_allowed(op_enum: str, ops: set[int]) -> str:
|
||||
"""Format allowed ops as {EnumName.MEMBER, ...}."""
|
||||
|
|
@ -456,8 +456,8 @@ def write_operands(types: dict, enums: dict, arch: str, path: pathlib.Path) -> N
|
|||
used_bases = {eb for (nm, eb) in types if (nm, eb) in valid}
|
||||
enum_names = sorted(f"{k}Op" for k in used_bases)
|
||||
lines: list[str] = ["# autogenerated from AMD ISA XML - do not edit",
|
||||
"from extra.assembly.amd.autogen.common import Fmt, OpType",
|
||||
f"from extra.assembly.amd.autogen.{arch}.enum import {', '.join(enum_names)}", ""]
|
||||
"from tinygrad.runtime.autogen.amd.common import Fmt, OpType",
|
||||
f"from tinygrad.runtime.autogen.amd.{arch}.enum import {', '.join(enum_names)}", ""]
|
||||
lines.append("# instruction operand info: {Op: {field: (Fmt, size_bits, OpType)}}")
|
||||
lines.append("OPERANDS = {")
|
||||
def fmt_val(v):
|
||||
|
|
@ -481,7 +481,7 @@ def write_pcode(pcode: dict[tuple[str, int], str], enums: dict[str, dict[int, st
|
|||
entries.append((f"{fmt_name}Op", f"{name}{msuf}", opcode, pcode[(name, opcode)]))
|
||||
enum_names = sorted(set(e[0] for e in entries))
|
||||
lines = ["# autogenerated from AMD ISA PDF - do not edit", "# ruff: noqa: E501",
|
||||
f"from extra.assembly.amd.autogen.{arch}.enum import {', '.join(enum_names)}", "", "PCODE = {"]
|
||||
f"from tinygrad.runtime.autogen.amd.{arch}.enum import {', '.join(enum_names)}", "", "PCODE = {"]
|
||||
for enum_name, name, opcode, code in sorted(entries, key=lambda x: (x[0], x[2])):
|
||||
lines.append(f" {enum_name}.{name}: {code!r},")
|
||||
lines.append("}")
|
||||
|
|
@ -508,12 +508,13 @@ if __name__ == "__main__":
|
|||
all_fmts[fmt] = bits
|
||||
all_op_types.update(op_types_set)
|
||||
# Write common.py
|
||||
common_path = pathlib.Path(__file__).parent / "autogen" / "common.py"
|
||||
autogen_base = pathlib.Path(__file__).parents[2] / "runtime" / "autogen" / "amd"
|
||||
common_path = autogen_base / "common.py"
|
||||
write_common(all_fmts, all_op_types, common_path)
|
||||
print(f"Wrote common.py: {len(all_fmts)} formats, {len(all_op_types)} op types")
|
||||
# Write per-arch files from XML
|
||||
for arch, data in arch_data.items():
|
||||
base = pathlib.Path(__file__).parent / "autogen" / arch
|
||||
base = autogen_base / arch
|
||||
write_enum(data["enums"], base / "enum.py")
|
||||
write_ins(data["encodings"], data["enums"], data["suffix_only_ops"], data["types"], arch, base / "ins.py")
|
||||
write_operands(data["types"], data["enums"], arch, base / "operands.py")
|
||||
|
|
@ -524,6 +525,6 @@ if __name__ == "__main__":
|
|||
pages = extract_pdf_text(cfg["pdf"])
|
||||
name_to_op = {name: op for ops in arch_data[arch]["enums"].values() for op, name in ops.items()}
|
||||
pcode = extract_pcode(pages, name_to_op)
|
||||
base = pathlib.Path(__file__).parent / "autogen" / arch
|
||||
base = autogen_base / arch
|
||||
write_pcode(pcode, arch_data[arch]["enums"], arch, base / "str_pcode.py")
|
||||
print(f" {arch}: {len(pcode)} pcode entries")
|
||||
|
|
@ -8,9 +8,9 @@ from __future__ import annotations
|
|||
from dataclasses import dataclass
|
||||
from typing import Iterator
|
||||
from enum import Enum
|
||||
from extra.assembly.amd.dsl import BitField, FixedBitField, Inst, bits
|
||||
from extra.assembly.amd.autogen.rdna3.ins import SOPP, s_endpgm
|
||||
from extra.assembly.amd.autogen.rdna3.enum import SOPPOp
|
||||
from tinygrad.renderer.amd.dsl import BitField, FixedBitField, Inst, bits
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import SOPP, s_endpgm
|
||||
from tinygrad.runtime.autogen.amd.rdna3.enum import SOPPOp
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════════════════════
|
||||
# FIELD ENUMS
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
# autogenerated from AMD ISA XML - do not edit
|
||||
from extra.assembly.amd.autogen.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
|
||||
from tinygrad.runtime.autogen.amd.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
|
||||
|
||||
class DSOp(ReprEnum):
|
||||
DS_ADD_U32 = 0
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
# autogenerated from AMD ISA XML - do not edit
|
||||
# ruff: noqa: E501,F401
|
||||
from extra.assembly.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SRsrcField, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VGPRField, s, src, ttmp, v
|
||||
from extra.assembly.amd.autogen.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3PX2Op, VOP3SDOp, VOPCOp, HWREG
|
||||
from tinygrad.renderer.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SRsrcField, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VGPRField, s, src, ttmp, v
|
||||
from tinygrad.runtime.autogen.amd.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3PX2Op, VOP3SDOp, VOPCOp, HWREG
|
||||
import functools
|
||||
|
||||
class DS(Inst):
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
# autogenerated from AMD ISA XML - do not edit
|
||||
from extra.assembly.amd.autogen.common import Fmt, OpType
|
||||
from extra.assembly.amd.autogen.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3PX2Op, VOP3SDOp, VOPCOp
|
||||
from tinygrad.runtime.autogen.amd.common import Fmt, OpType
|
||||
from tinygrad.runtime.autogen.amd.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3PX2Op, VOP3SDOp, VOPCOp
|
||||
|
||||
# instruction operand info: {Op: {field: (Fmt, size_bits, OpType)}}
|
||||
OPERANDS = {
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
# autogenerated from AMD ISA PDF - do not edit
|
||||
# ruff: noqa: E501
|
||||
from extra.assembly.amd.autogen.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp
|
||||
from tinygrad.runtime.autogen.amd.cdna.enum import DSOp, FLATOp, GLOBALOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp
|
||||
|
||||
PCODE = {
|
||||
DSOp.DS_ADD_U32: 'addr = CalcDsAddr(ADDR.b32, OFFSET0.b32, OFFSET1.b32);\ntmp = MEM[addr].u32;\nMEM[addr].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
# autogenerated from AMD ISA XML - do not edit
|
||||
from extra.assembly.amd.autogen.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
|
||||
from tinygrad.runtime.autogen.amd.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
|
||||
|
||||
class DSOp(ReprEnum):
|
||||
DS_ADD_U32 = 0
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
# autogenerated from AMD ISA XML - do not edit
|
||||
# ruff: noqa: E501,F401
|
||||
from extra.assembly.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SRsrcField, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VDSTYField, VGPRField, s, src, ttmp, v
|
||||
from extra.assembly.amd.autogen.rdna3.enum import DSOp, EXPOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, HWREG, MSG
|
||||
from tinygrad.renderer.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SRsrcField, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VDSTYField, VGPRField, s, src, ttmp, v
|
||||
from tinygrad.runtime.autogen.amd.rdna3.enum import DSOp, EXPOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, HWREG, MSG
|
||||
import functools
|
||||
|
||||
class DS(Inst):
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
# autogenerated from AMD ISA XML - do not edit
|
||||
from extra.assembly.amd.autogen.common import Fmt, OpType
|
||||
from extra.assembly.amd.autogen.rdna3.enum import DSOp, EXPOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp
|
||||
from tinygrad.runtime.autogen.amd.common import Fmt, OpType
|
||||
from tinygrad.runtime.autogen.amd.rdna3.enum import DSOp, EXPOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp
|
||||
|
||||
# instruction operand info: {Op: {field: (Fmt, size_bits, OpType)}}
|
||||
OPERANDS = {
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
# autogenerated from AMD ISA PDF - do not edit
|
||||
# ruff: noqa: E501
|
||||
from extra.assembly.amd.autogen.rdna3.enum import DSOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp
|
||||
from tinygrad.runtime.autogen.amd.rdna3.enum import DSOp, FLATOp, GLOBALOp, LDSDIROp, MIMGOp, MTBUFOp, MUBUFOp, SCRATCHOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp
|
||||
|
||||
PCODE = {
|
||||
DSOp.DS_ADD_U32: 'tmp = MEM[ADDR].u32;\nMEM[ADDR].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',
|
||||
|
|
@ -1,5 +1,5 @@
|
|||
# autogenerated from AMD ISA XML - do not edit
|
||||
from extra.assembly.amd.autogen.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
|
||||
from tinygrad.runtime.autogen.amd.common import ReprEnum, Fmt, FMT_BITS, OpType # noqa: F401
|
||||
|
||||
class DSOp(ReprEnum):
|
||||
DS_ADD_U32 = 0
|
||||
|
|
@ -1,7 +1,7 @@
|
|||
# autogenerated from AMD ISA XML - do not edit
|
||||
# ruff: noqa: E501,F401
|
||||
from extra.assembly.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VDSTYField, VGPRField, s, src, ttmp, v
|
||||
from extra.assembly.amd.autogen.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VDSDIROp, VEXPORTOp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp, HWREG, MSG
|
||||
from tinygrad.renderer.amd.dsl import BitField, DPP, DPP16, EXEC, EXECZ, EXEC_HI, EXEC_LO, EnumBitField, FixedBitField, INV_2PI, Inst, LIT, M0, NULL, OFF, SBaseField, SCC, SDWA, SGPRField, SRC_LDS_DIRECT, SSrcField, SrcField, VCC, VCCZ, VCC_HI, VCC_LO, VDSTYField, VGPRField, s, src, ttmp, v
|
||||
from tinygrad.runtime.autogen.amd.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VDSDIROp, VEXPORTOp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp, HWREG, MSG
|
||||
import functools
|
||||
|
||||
class DS(Inst):
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
# autogenerated from AMD ISA XML - do not edit
|
||||
from extra.assembly.amd.autogen.common import Fmt, OpType
|
||||
from extra.assembly.amd.autogen.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VDSDIROp, VEXPORTOp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp
|
||||
from tinygrad.runtime.autogen.amd.common import Fmt, OpType
|
||||
from tinygrad.runtime.autogen.amd.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VDSDIROp, VEXPORTOp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp
|
||||
|
||||
# instruction operand info: {Op: {field: (Fmt, size_bits, OpType)}}
|
||||
OPERANDS = {
|
||||
|
|
@ -1,6 +1,6 @@
|
|||
# autogenerated from AMD ISA PDF - do not edit
|
||||
# ruff: noqa: E501
|
||||
from extra.assembly.amd.autogen.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp
|
||||
from tinygrad.runtime.autogen.amd.rdna4.enum import DSOp, SMEMOp, SOP1Op, SOP2Op, SOPCOp, SOPKOp, SOPPOp, VBUFFEROp, VFLATOp, VGLOBALOp, VIMAGEOp, VINTERPOp, VOP1Op, VOP2Op, VOP3Op, VOP3POp, VOP3SDOp, VOPCOp, VOPDOp, VSAMPLEOp, VSCRATCHOp
|
||||
|
||||
PCODE = {
|
||||
DSOp.DS_ADD_U32: 'addr = CalcDsAddr(vgpr_a.b32, offset.b32);\ntmp = MEM[addr].u32;\nMEM[addr].u32 += DATA.u32;\nRETURN_DATA.u32 = tmp',
|
||||
|
|
@ -309,7 +309,7 @@ def load_counters(profile:list[ProfileEvent]) -> None:
|
|||
ctxs.append({"name":f"Exec {name}"+(f" n{run_number[k]}" if run_number[k] > 1 else ""), "steps":steps})
|
||||
|
||||
def sqtt_timeline(data:bytes, lib:bytes, target:int) -> list[ProfileEvent]:
|
||||
from extra.assembly.amd.sqtt import map_insts, InstructionInfo, PacketType, INST, InstOp, VALUINST, IMMEDIATE, IMMEDIATE_MASK, VMEMEXEC, ALUEXEC
|
||||
from tinygrad.renderer.amd.sqtt import map_insts, InstructionInfo, PacketType, INST, InstOp, VALUINST, IMMEDIATE, IMMEDIATE_MASK, VMEMEXEC, ALUEXEC
|
||||
ret:list[ProfileEvent] = []
|
||||
rows:dict[str, None] = {}
|
||||
trace:dict[str, set[int]] = {}
|
||||
|
|
@ -430,10 +430,10 @@ def amd_readelf(lib:bytes) -> list[dict]:
|
|||
return [{"label":f"{resource} Alloc", "value":val} for resource,val in [("VGPR", (vgpr_gran+1)*8-7), ("LDS",kd.group_segment_fixed_size),
|
||||
("Scratch", kd.private_segment_fixed_size)] if val > 0]
|
||||
|
||||
def amd_decode(lib:bytes, target:int) -> dict[int, Any]: # Any is the Inst class from extra.assembly.amd.dsl
|
||||
def amd_decode(lib:bytes, target:int) -> dict[int, Any]: # Any is the Inst class from tinygrad.renderer.amd.dsl
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from extra.assembly.amd import detect_format
|
||||
from extra.assembly.amd.dsl import Inst
|
||||
from tinygrad.renderer.amd import detect_format
|
||||
from tinygrad.renderer.amd.dsl import Inst
|
||||
image, sections, _ = elf_loader(lib)
|
||||
text = next((sh for sh in sections if sh.name == ".text"), None)
|
||||
assert text is not None, "no .text section found in ELF"
|
||||
|
|
@ -486,7 +486,7 @@ def amdgpu_cfg(lib:bytes, target:int) -> dict:
|
|||
else: paths[curr].update([(nx+offset, COND_TAKEN), (nx, COND_NOT_TAKEN)])
|
||||
elif nx in leaders: paths[curr][nx] = UNCOND
|
||||
pc_tokens:dict[int, list[dict]] = {}
|
||||
from extra.assembly.amd.dsl import Reg
|
||||
from tinygrad.renderer.amd.dsl import Reg
|
||||
for pc, inst in pc_table.items():
|
||||
pc_tokens[pc] = tokens = []
|
||||
for name, field in inst._fields:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue