remove REMU from tree (#15706)

* no more compare emulators

* remove remu from tree
This commit is contained in:
George Hotz 2026-04-13 20:43:08 +08:00 committed by GitHub
commit 16f50a40a5
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
22 changed files with 77 additions and 5732 deletions

View file

@ -225,14 +225,12 @@ runs:
if: inputs.amd == 'true' && runner.os == 'Linux'
shell: bash
run: |
cargo build --release --manifest-path ./extra/remu/Cargo.toml
sudo ln -sf ${{ github.workspace }}/extra/remu/target/release/libremu.so /usr/local/lib/libremu.so
sudo tee --append /etc/ld.so.conf.d/rocm.conf <<'EOF'
/opt/rocm/lib
/opt/rocm/lib64
EOF
sudo ldconfig
- name: Setup AMD comgr+remu (macOS)
- name: Setup AMD comgr (macOS)
if: inputs.amd == 'true' && runner.os == 'macOS'
shell: bash
run: |
@ -240,7 +238,6 @@ runs:
curl -s -H "Authorization: token $GH_TOKEN" curl -s https://api.github.com/repos/tinygrad/amdcomgr_dylib/releases/latest | \
jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \
sudo xargs curl -fL -o /usr/local/lib/libamd_comgr.dylib
cargo build --release --manifest-path ./extra/remu/Cargo.toml
# **** gpuocelot ****

View file

@ -71,10 +71,6 @@ jobs:
uv venv /tmp/tinygrad_pytest_ci
source /tmp/tinygrad_pytest_ci/bin/activate
uv pip install .[testing]
- name: setup other stuff
run: |
mkdir -p extra/remu/target/release/
ln -s ~/tinygrad/extra/remu/target/release/libremu.so extra/remu/target/release/libremu.so
- name: setup staging db
run: |
echo "CACHEDB=/tmp/pytest-db-ci.db" >> $GITHUB_ENV

View file

@ -644,7 +644,6 @@ jobs:
timeout-minutes: 20
env:
DEV: AMD
PYTHON_REMU: 1
MOCKGPU: 1
steps:
- name: Checkout Code

66
extra/remu/Cargo.lock generated
View file

@ -1,66 +0,0 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "autocfg"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "crunchy"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
[[package]]
name = "float-cmp"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4"
dependencies = [
"num-traits",
]
[[package]]
name = "half"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872"
dependencies = [
"cfg-if",
"crunchy",
"num-traits",
]
[[package]]
name = "libm"
version = "0.2.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
[[package]]
name = "num-traits"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
dependencies = [
"autocfg",
"libm",
]
[[package]]
name = "remu"
version = "0.1.0"
dependencies = [
"float-cmp",
"half",
"num-traits",
]

View file

@ -1,15 +0,0 @@
[package]
name = "remu"
version = "0.1.0"
edition = "2021"
rust-version = "1.80.0"
[lib]
crate-type = ["cdylib"]
[dependencies]
half = { version = "2.3.1", features = ["num-traits"] }
num-traits = "0.2.17"
[dev-dependencies]
float-cmp = "0.9.0"

View file

@ -1,80 +0,0 @@
## Intro
Remu is an RDNA3 emulator built to test correctness of RDNA3 code. It is used in [tinygrad's AMD CI](https://github.com/tinygrad/tinygrad).
Most of the common instructions are implemented, but some formats like IMG are not supported.
Remu is only for testing correctness of program output, it is not a cycle accurate simulator.
## Build Locally
Remu is written in Rust. Make sure you have [Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html).
To build the project, run:
```bash
cargo build --release --manifest-path ./extra/remu/Cargo.toml
```
This will produce a binary in the `extra/remu/target/release` directory.
## Usage with tinygrad
The latest binaries are released in https://github.com/Qazalin/remu/releases. Alternatively, you can [build locally](#build-locally).
Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `DEV=AMD:LLVM` (default) if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).
`PYTHONPATH="." MOCKGPU=1 DEV=AMD python test/test_tiny.py TestTiny.test_plus` runs an emulated RDNA3 kernel with Remu.
Add `DEBUG=6` to see Remu's logs.
### DEBUG output
Remu runs each thread one at a time in a nested for loop, see lib.rs. The DEBUG output prints information about the current thread.
The DEBUG output has 3 sections:
```
<------------ 1 ----------> <--- 2 ---> <--------------------------------------- 3 ------------------------------------------>
[0 0 0 ] [0 0 0 ] 0 F4080100 SMEM { op: 2, sdata: 4, sbase: 0, offset: 0, soffset: 124, glc: false, dlc: false }
```
#### Section 1: Grid info
`[gid.x, gid.y, gid.z], [lid.x, lid.y, lid.z]` of the current thread.
#### Section 2: Wave info
`<lane> <instruction hex>`
RDNA3 divides threads into chunks of 32. Each thread is assigned to a "lane" from 0-31.
In Remu, even though all threads run one at a time, each 32 thread chunk (a wave) shares state like SGPR, VGPR, LDS, EXEC mask, etc.
Remu can simulate up to one wave sync instruction.
For more details, see work_group.rs.
Section 2 can have a green or gray color.
Green = The thread is actively executing the instruction.
Gray = The thread has been "turned off" by the EXEC mask, it skips execution of some instructions. (refer to "EXECute Mask" on [page 23](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf#page=23) of ISA docs for more details.)
To see the colors in action, try running `DEBUG=6 PYTHONPATH="." MOCKGPU=1 DEV=AMD python test/test_ops.py TestOps.test_arange_big`. See how only lane 0 writes to global memory:
```
[255 0 0 ] [0 0 0 ] 0 DC6A0000 FLAT { op: 26, offset: 0, dlc: false, glc: false, slc: false, seg: 2, addr: 8, data: 0, saddr: 0, sve: false, vdst: 0 }
[255 0 0 ] [1 0 0 ] 1 DC6A0000
[255 0 0 ] [2 0 0 ] 2 DC6A0000
[255 0 0 ] [3 0 0 ] 3 DC6A0000
[255 0 0 ] [3 0 0 ] 4 DC6A0000
```
#### Section 3: Decoded Instruction
This prints the instruction type and all the parsed bitfields.
Remu output vs llvm-objdump:
```
s_load_b64 s[0:1], s[0:1], 0x10 // 00000000160C: F4040000 F8000010
SMEM { op: 1, sdata: 0, sbase: 0, offset: 16, soffset: 124, glc: false, dlc: false }
```

View file

@ -1 +0,0 @@
max_width = 150

View file

@ -1,162 +0,0 @@
use half::f16;
use num_traits::{float::FloatCore, PrimInt, Unsigned, clamp};
pub fn bits<T>(word: T, hi: usize, lo: usize) -> T where T: PrimInt + Unsigned {
assert!(hi >= lo);
let width = hi - lo + 1;
(word >> lo) & ((T::one() << width) - T::one())
}
pub fn nth(val: u32, pos: usize) -> u32 {
(val >> (31 - pos as u32)) & 1
}
pub fn f16_lo(val: u32) -> f16 {
f16::from_bits((val & 0xffff) as u16)
}
pub fn f16_hi(val: u32) -> f16 {
f16::from_bits(((val >> 16) & 0xffff) as u16)
}
pub fn sign_ext(num: u64, bits: usize) -> i64 {
let mut value = num;
let is_negative = (value >> (bits - 1)) & 1 != 0;
if is_negative {
value |= !0 << bits;
}
value as i64
}
pub trait IEEEClass<T> {
fn exponent(&self) -> T;
}
impl IEEEClass<u32> for f32 {
fn exponent(&self) -> u32 {
(self.to_bits() & 0b01111111100000000000000000000000) >> 23
}
}
impl IEEEClass<u16> for f16 {
fn exponent(&self) -> u16 {
(self.to_bits() & 0b0111110000000000) >> 10
}
}
impl IEEEClass<u64> for f64 {
fn exponent(&self) -> u64 {
(self.to_bits() & 0b0111111111110000000000000000000000000000000000000000000000000000) >> 52
}
}
pub trait VOPModifier<T> {
fn negate(&self, pos: usize, modifier: usize) -> T;
fn absolute(&self, pos: usize, modifier: usize) -> T;
fn clmp(&self, cm: bool) -> T;
}
impl<T> VOPModifier<T> for T
where
T: FloatCore,
{
fn negate(&self, pos: usize, modifier: usize) -> T {
match (modifier >> pos) & 1 {
1 => -*self,
_ => *self,
}
}
fn absolute(&self, pos: usize, modifier: usize) -> T {
match (modifier >> pos) & 1 {
1 => self.abs(),
_ => *self,
}
}
fn clmp(&self, cm:bool) -> T {
if !cm { return *self }
let r = clamp(*self, T::zero(), T::one());
if r == T::zero() { T::zero() } else { r }
}
}
pub fn extract_mantissa(x: f64) -> f64 {
if x.is_infinite() || x.is_nan() {
return x;
}
let bits = x.to_bits();
let mantissa_mask: u64 = 0x000FFFFFFFFFFFFF;
let bias: u64 = 1023;
let normalized_mantissa_bits = (bits & mantissa_mask) | ((bias - 1) << 52);
return f64::from_bits(normalized_mantissa_bits);
}
pub fn ldexp(x: f64, exp: i32) -> f64 {
x * 2f64.powi(exp)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_extract_mantissa() {
assert_eq!(extract_mantissa(2.0f64), 0.5);
}
#[test]
fn test_normal_exponent() {
assert_eq!(2.5f32.exponent(), 128);
assert_eq!(1.17549435e-38f32.exponent(), 1);
assert_eq!(f32::INFINITY.exponent(), 255);
assert_eq!(f32::NEG_INFINITY.exponent(), 255);
}
#[test]
fn test_denormal_exponent() {
assert_eq!(1.0e-40f32.exponent(), 0);
assert_eq!(1.0e-42f32.exponent(), 0);
assert_eq!(1.0e-44f32.exponent(), 0);
assert_eq!((1.17549435e-38f32 / 2.0).exponent(), 0);
}
#[test]
fn test_normal_exponent_f16() {
assert_eq!(f16::from_f32(3.14f32).exponent(), 16);
assert_eq!(f16::NEG_INFINITY.exponent(), 31);
assert_eq!(f16::INFINITY.exponent(), 31);
}
#[test]
fn test_neg() {
assert_eq!(0.3_f32.negate(0, 0b001), -0.3_f32);
assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32);
assert_eq!(0.3_f32.negate(2, 0b100), -0.3_f32);
assert_eq!(0.3_f32.negate(0, 0b110), 0.3_f32);
assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32);
assert_eq!(0.0_f32.negate(0, 0b001).to_bits(), (-0.0f32).to_bits());
assert_eq!((-0.0_f32).negate(0, 0b001).to_bits(), 0);
}
#[test]
fn test_sign_ext() {
assert_eq!(sign_ext(0b000000000000000101000, 21), 40);
assert_eq!(sign_ext(0b111111111111111011000, 21), -40);
assert_eq!(sign_ext(0b000000000000000000000, 21), 0);
assert_eq!(sign_ext(0b111111111111111111111, 21), -1);
assert_eq!(sign_ext(0b111000000000000000000, 21), -262144);
assert_eq!(sign_ext(0b000111111111111111111, 21), 262143);
assert_eq!(sign_ext(7608, 13), -584);
}
}
use std::sync::LazyLock;
pub static DEBUG: LazyLock<bool> = LazyLock::new(|| std::env::var("DEBUG").map(|v| v.parse::<usize>().unwrap_or(0) >= 6).unwrap_or(false));
pub fn colored(st:&str, color:&str) -> String {
let ansi_code = match color {
"green" => format!("\x1b[{};2;39;176;139m", 38),
"gray" => format!("\x1b[{};2;169;169;169m", 38),
_ => format!("\x1b[{};2;255;255;255m", 38),
};
format!("{}{}{}", ansi_code, st, "\x1b[0m")
}
#[macro_export]
macro_rules! todo_instr {
($x:expr) => {{
println!("{:08X}", $x);
Err(1)
}};
}

View file

@ -1,77 +0,0 @@
use crate::state::StateSnapshot;
use crate::work_group::{WaveContext, WorkGroup};
use std::os::raw::c_char;
use std::slice;
mod helpers;
mod rdna3;
mod state;
mod thread;
mod work_group;
#[no_mangle]
pub extern "C" fn run_asm(lib: *const c_char, lib_sz: u32, gx: u32, gy: u32, gz: u32, lx: u32, ly: u32, lz: u32, args_ptr: *const u64) -> i32 {
if lib.is_null() || (lib_sz % 4) != 0 {
panic!("Pointer is null or length is not properly aligned to 4 bytes");
}
let kernel = unsafe { slice::from_raw_parts(lib as *const u32, (lib_sz / 4) as usize).to_vec() };
let dispatch_dim = match (gy != 1, gz != 1) {
(true, true) => 3,
(true, false) => 2,
_ => 1,
};
for gx in 0..gx {
for gy in 0..gy {
for gz in 0..gz {
let mut wg = WorkGroup::new(dispatch_dim, [gx, gy, gz], [lx, ly, lz], &kernel, args_ptr);
if let Err(err) = wg.exec_waves() {
return err;
}
}
}
}
0
}
// FFI functions for single-stepping comparison tests
#[no_mangle]
pub extern "C" fn wave_create(lib: *const c_char, lib_sz: u32, n_lanes: u32) -> *mut WaveContext {
if lib.is_null() || (lib_sz % 4) != 0 { return std::ptr::null_mut(); }
let kernel = unsafe { slice::from_raw_parts(lib as *const u32, (lib_sz / 4) as usize).to_vec() };
Box::into_raw(Box::new(WaveContext::new(kernel, n_lanes as usize)))
}
#[no_mangle]
pub extern "C" fn wave_step(ctx: *mut WaveContext) -> i32 {
if ctx.is_null() { return -99; }
unsafe { (*ctx).step() }
}
#[no_mangle]
pub extern "C" fn wave_get_snapshot(ctx: *const WaveContext, out: *mut StateSnapshot) {
if ctx.is_null() || out.is_null() { return; }
unsafe { *out = (*ctx).get_snapshot(); }
}
#[no_mangle]
pub extern "C" fn wave_set_sgpr(ctx: *mut WaveContext, idx: u32, val: u32) {
if ctx.is_null() || idx >= 128 { return; }
unsafe { (*ctx).scalar_reg[idx as usize] = val; }
}
#[no_mangle]
pub extern "C" fn wave_set_vgpr(ctx: *mut WaveContext, lane: u32, idx: u32, val: u32) {
if ctx.is_null() || lane >= 32 || idx >= 256 { return; }
unsafe { (*ctx).vec_reg.get_lane_mut(lane as usize)[idx as usize] = val; }
}
#[no_mangle]
pub extern "C" fn wave_init_lds(ctx: *mut WaveContext, size: u32) {
if ctx.is_null() { return; }
unsafe { (*ctx).lds.data.resize(size as usize, 0); }
}
#[no_mangle]
pub extern "C" fn wave_free(ctx: *mut WaveContext) {
if !ctx.is_null() { unsafe { drop(Box::from_raw(ctx)); } }
}

View file

@ -1,223 +0,0 @@
use crate::helpers::{bits, sign_ext};
#[derive(Debug, PartialEq)]
pub enum Instruction {
SOP2 { op: u8, ssrc0: u8, ssrc1: u8, sdst: u8 },
SOP1 { op: u8, ssrc0: u8, sdst: u8 },
SOPK { op: u8, simm16: i16, sdst: u8 },
SOPP { op: u8, simm16: i16 },
SOPC { op: u8, ssrc0: u8, ssrc1: u8 },
SMEM { op: u8, sdata: u8, sbase: u8, offset: i32, soffset: u8, glc: bool, dlc: bool },
VOP1 { op: u8, vdst: u8, src: u16 },
VOP2 { op: u8, vdst: u8, vsrc: u8, src: u16 },
VOPC { op: u8, vsrc: u8, src: u16 },
VOP3 { op: u32, opsel: u8, cm: bool, abs: u8, vdst: u8, neg: u8, omod: u8, src2: u16, src1: u16, src0: u16 },
VOP3SD { op: u32, cm: bool, sdst: u8, vdst: u8, neg: u8, omod: u8, src2: u16, src1: u16, src0: u16 },
VOP3P { op: u8, vdst: u8, neg_hi: u8, opsel: u8, opsel_hi: u8, opsel_hi2: bool, cm: bool, src2: u16, src1: u16, src0: u16, neg: u8 },
VOPD { opx: u8, opy: u8, vdstx: u8, vdsty: u8, vsrcx1: u8, vsrcy1: u8, srcx0: u16, srcy0: u16 },
DS { op: u8, gds: bool, offset1: u8, offset0: u8, vdst: u8, data1: u8, data0: u8, addr: u8 },
FLAT { op: u8, offset: u16, dlc: bool, glc: bool, slc: bool, seg: u8, addr: u8, data: u8, saddr: u8, sve: bool, vdst: u8 }
}
const VOP3SD_OPS: [u32; 7] = [764, 765, 766, 767, 768, 769, 770];
pub fn decode(word:u32, word1:Option<&u32>) -> Instruction {
match bits(word, 31, 30) {
0b11 => {
let word = (*word1.unwrap() as u64) << 32 | (word as u64);
match bits(word, 29, 26) {
0b1101 => {
let sbase = (bits(word, 5, 0) as u8) << 1;
let sdata = bits(word, 12, 6) as u8;
let dlc = bits(word, 13, 13) != 0;
let glc = bits(word, 14, 14) != 0;
let op = bits(word, 25, 18) as u8;
let offset = sign_ext(bits(word, 52, 32), 21) as i32;
let soffset = bits(word, 63, 57) as u8;
Instruction::SMEM { sbase, sdata, dlc, glc, op, offset, soffset }
}
0b0101 => {
let op = bits(word, 25, 16) as u32;
let vdst = bits(word, 7, 0) as u8;
let cm = bits(word, 15, 15) != 0;
let src0 = bits(word, 40, 32) as u16;
let src1 = bits(word, 49, 41) as u16;
let src2 = bits(word, 58, 50) as u16;
let omod = bits(word, 60, 59) as u8;
let neg = bits(word, 63, 61) as u8;
if VOP3SD_OPS.contains(&op) {
let sdst = bits(word, 14, 8) as u8;
Instruction::VOP3SD { op, vdst, sdst, cm, src0, src1, src2, omod, neg }
} else {
let abs = bits(word, 10, 8) as u8;
let opsel = bits(word, 14, 11) as u8;
Instruction::VOP3 { opsel, cm, abs, vdst, neg, omod, src2, src1, src0, op }
}
}
0b0011 => {
let op = bits(word, 22, 16) as u8;
let vdst = bits(word, 7, 0) as u8;
let neg_hi = bits(word, 10, 8) as u8;
let opsel = bits(word, 13, 11) as u8;
let opsel_hi2 = bits(word, 14, 14) != 0;
let cm = bits(word, 15, 15) != 0;
let src0 = bits(word, 40, 32) as u16;
let src1 = bits(word, 49, 41) as u16;
let src2 = bits(word, 58, 50) as u16;
let opsel_hi = bits(word, 60, 59) as u8;
let neg = bits(word, 63, 61) as u8;
Instruction::VOP3P { op, vdst, neg_hi, opsel, opsel_hi, opsel_hi2, cm, src0, src1, src2, neg }
}
0b0110 => {
let offset0 = bits(word, 7, 0) as u8;
let offset1 = bits(word, 15, 8) as u8;
let gds = bits(word, 17, 17) != 0;
let op = bits(word, 25, 18) as u8;
let addr = bits(word, 39, 32) as u8;
let data0 = bits(word, 47, 40) as u8;
let data1 = bits(word, 55, 48) as u8;
let vdst = bits(word, 63, 56) as u8;
Instruction::DS { op, gds, offset1, offset0, vdst, data1, data0, addr }
}
0b0111 => {
let offset = bits(word, 12, 0) as u16;
let dlc = bits(word, 13, 13) != 0;
let glc = bits(word, 14, 14) != 0;
let slc = bits(word, 15, 15) != 0;
let seg = bits(word, 17, 16) as u8;
let op = bits(word, 24, 18) as u8;
let addr = bits(word, 39, 32) as u8;
let data = bits(word, 47, 40) as u8;
let saddr = bits(word, 54, 48) as u8;
let sve = bits(word, 55, 55) != 0;
let vdst = bits(word, 63, 56) as u8;
Instruction::FLAT { offset, dlc, glc, slc, seg, op, addr, data, saddr, sve, vdst }
},
0b0010 => {
let srcx0 = bits(word, 8, 0) as u16;
let vsrcx1 = bits(word, 16, 9) as u8;
let opy = bits(word, 21, 17) as u8;
let opx = bits(word, 25, 22) as u8;
let srcy0 = bits(word, 40, 32) as u16;
let vsrcy1 = bits(word, 48, 41) as u8;
let vdsty = bits(word, 55, 49) as u8;
let vdstx = bits(word, 63, 56) as u8;
Instruction::VOPD { opx, opy, vdstx, vdsty, vsrcx1, vsrcy1, srcx0, srcy0 }
}
_ => todo!(),
}
}
0b10 => {
let ssrc0 = bits(word, 7, 0) as u8;
let ssrc1 = bits(word, 15, 8) as u8;
let simm16 = word as i16;
let sdst = bits(word, 22, 16) as u8;
match bits(word, 29, 23) {
0b1111101 => Instruction::SOP1 { ssrc0, sdst, op: bits(word, 15, 8) as u8 },
0b1111110 => Instruction::SOPC { ssrc0, ssrc1, op: bits(word, 22, 16) as u8 },
0b1111111 => Instruction::SOPP { simm16, op: bits(word, 22, 16) as u8 },
_ => {
match bits(word, 29, 28) {
0b11 => Instruction::SOPK { simm16, sdst, op: bits(word, 27, 23) as u8 },
_ => Instruction::SOP2 { ssrc0, ssrc1, sdst, op: bits(word, 29, 23) as u8 }
}
}
}
}
_ => {
let vdst = bits(word, 24, 17) as u8;
let src = bits(word, 8, 0) as u16;
let vsrc = bits(word, 16, 9) as u8;
match bits(word, 30, 25) {
0b111110 => Instruction::VOPC { vsrc, src, op: bits(word, 24, 17) as u8 },
0b111111 => Instruction::VOP1 { vdst, src, op: vsrc },
_ => Instruction::VOP2 { vdst, vsrc, src, op: bits(word, 30, 25) as u8 },
}
},
}
}
#[cfg(test)]
mod test_rdna3 {
use super::*;
use std::process::{Stdio, Command};
use std::io::{Result, Write};
const LLVM_ARGS: &[&str; 3] = &["--arch=amdgcn", "--mcpu=gfx1100", "--triple=amdgcn-amd-amdhsa"];
const OFFSET_PRG: usize = 16;
const NULL: u8 = 124;
fn llvm_assemble(asm: &str) -> Result<Vec<u8>> {
let mut proc = Command::new("llvm-mc").args(LLVM_ARGS).args(["-filetype=obj", "-o", "-"]).stdin(Stdio::piped()).stdout(Stdio::piped()).spawn()?;
proc.stdin.as_mut().unwrap().write_all(asm.as_bytes())?;
let out = proc.wait_with_output()?;
match out.status.success() {
true => Ok(out.stdout),
false => Err(std::io::Error::new(std::io::ErrorKind::Other, "llvm-mc err")),
}
}
fn llvm_disassemble(code: &Vec<u8>) -> Result<String> {
let mut proc = Command::new("llvm-objdump").args(LLVM_ARGS).args(["--disassemble", "-"]).stdin(Stdio::piped()).stdout(Stdio::piped()).spawn()?;
proc.stdin.as_mut().unwrap().write_all(code)?;
let out = proc.wait_with_output()?;
match out.status.success() {
true => Ok(String::from_utf8(out.stdout).unwrap()),
false => Err(std::io::Error::new(std::io::ErrorKind::Other, "llvm-objdump err")),
}
}
fn test_decode(asm: &str) -> Instruction {
let lib = llvm_assemble(asm).unwrap();
println!("{}", llvm_disassemble(&lib).unwrap());
let stream: Vec<u32> = lib.chunks_exact(4).map(|chunk| u32::from_le_bytes(chunk.try_into().unwrap())).skip(OFFSET_PRG).collect();
decode(stream[0], stream.get(1))
}
#[test]
fn test_decode_smem() {
assert_eq!(test_decode("s_load_b128 s[4:7], s[0:1], null"), Instruction::SMEM { op: 2, sdata: 4, sbase: 0, offset: 0, soffset: NULL, glc: false, dlc: false });
assert_eq!(test_decode("s_load_b32 s10, s[0:1], 0xc"), Instruction::SMEM { op: 0, sdata: 10, sbase: 0, offset: 0xc, soffset: NULL, glc: false, dlc: false });
assert_eq!(test_decode("s_load_b32 s0, s[4:5], s6"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: 0, soffset: 6, glc: false, dlc: false });
assert_eq!(test_decode("s_load_b32 s0, s[4:5], glc dlc"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: 0, soffset: NULL, glc: true, dlc: true });
assert_eq!(test_decode("s_load_b32 s0, s[4:5], glc"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: 0, soffset: NULL, glc: true, dlc: false });
assert_eq!(test_decode("s_load_b32 s0, s[4:5], -20"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: -20, soffset: NULL, glc: false, dlc: false });
assert_eq!(test_decode("s_load_b32 s0, s[4:5], -1048576"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: -1048576, soffset: NULL, glc: false, dlc: false });
}
#[test]
fn test_decode_salu() {
assert_eq!(test_decode("s_add_u32 s1 s2 s3"), Instruction::SOP2 { op: 0, ssrc0: 2, ssrc1: 3, sdst: 1 });
assert_eq!(test_decode("s_add_u32 vcc_hi exec_lo vcc_lo"), Instruction::SOP2 { op: 0, ssrc0: 126, ssrc1: 106, sdst: 107 });
assert_eq!(test_decode("s_mov_b32 s1 -0.5"), Instruction::SOP1 { op: 0, ssrc0: 241, sdst: 1 });
assert_eq!(test_decode("s_cmpk_eq_i32 s0 -30"), Instruction::SOPK { op: 3, sdst: 0, simm16: -30 });
assert_eq!(test_decode("s_cmpk_eq_u32 s0 65535"), Instruction::SOPK { op: 9, sdst: 0, simm16: -1 });
assert_eq!(test_decode("s_cmp_ge_i32 s1 s2"), Instruction::SOPC { op: 3, ssrc0: 1, ssrc1: 2 });
}
#[test]
fn test_decode_valu_e32() {
assert_eq!(test_decode("v_mov_b32 v0, v0"), Instruction::VOP1 { op: 1, vdst: 0, src: 256 });
assert_eq!(test_decode("v_mov_b32 v0, s0"), Instruction::VOP1 { op: 1, vdst: 0, src: 0 });
assert_eq!(test_decode("v_cmp_t_f32 v1, v0"), Instruction::VOPC { op: 31, vsrc: 0, src: 257 });
}
#[test]
fn test_decode_valu_e64() {
assert_eq!(test_decode("v_log_f32_e64 v2, |v0|"), Instruction::VOP3 { op: 423, vdst: 2, src0: 256, src1: 0, src2: 0, abs: 0b001, neg: 0, opsel: 0, omod: 0, cm: false });
assert_eq!(test_decode("v_div_scale_f32 v2, s1, v0, v1, v2"), Instruction::VOP3SD { op: 764, cm: false, vdst: 2, sdst: 1, src0: 256, src1: 257, src2: 258, omod: 0, neg: 0 });
assert_eq!(test_decode("v_pk_add_i16 v1, v0, v2"), Instruction::VOP3P { op: 2, vdst: 1, neg_hi: 0, opsel: 0, opsel_hi: 3, opsel_hi2: true, cm: false, src2: 0, src1: 258, src0: 256, neg: 0 });
}
#[test]
fn test_decode_ds() {
assert_eq!(test_decode("ds_add_u32 v2, v4 offset:16"), Instruction::DS { op: 0, gds: false, offset1: 0, offset0: 0x10, vdst: 0, data1: 0, data0: 4, addr: 2 });
assert_eq!(test_decode("ds_store_b32 v0, v1, offset: 0x04 gds"), Instruction::DS { op: 13, gds: true, offset1: 0, offset0: 0x04, vdst: 0, data1: 0, data0: 1, addr: 0 });
assert_eq!(test_decode("ds_load_u8 v1, v0 offset:16"), Instruction::DS { op: 58, gds: false, offset1: 0, offset0: 16, vdst: 1, data1: 0, data0: 0, addr: 0 });
}
}

View file

@ -1,272 +0,0 @@
use std::ops::{Index, IndexMut};
pub trait Register {
fn read64(&self, idx: usize) -> u64;
fn write64(&mut self, idx: usize, addr: u64);
}
impl<T> Register for T where T: Index<usize, Output = u32> + IndexMut<usize> {
fn read64(&self, idx: usize) -> u64 {
let lsb = self[idx] as u64;
let msb = self[idx + 1] as u64;
(msb << 32) | lsb
}
fn write64(&mut self, idx: usize, value: u64) {
self[idx] = (value & 0xffffffff) as u32;
self[idx + 1] = ((value & (0xffffffff << 32)) >> 32) as u32;
}
}
#[derive(Debug, Clone)]
pub struct VGPR {
values: [[u32; 256]; 32],
pub default_lane: Option<usize>,
}
impl Index<usize> for VGPR {
type Output = u32;
fn index(&self, index: usize) -> &Self::Output {
&self.values[self.default_lane.unwrap()][index]
}
}
impl IndexMut<usize> for VGPR {
fn index_mut(&mut self, index: usize) -> &mut Self::Output {
&mut self.values[self.default_lane.unwrap()][index]
}
}
impl VGPR {
pub fn new() -> Self {
VGPR {
values: [[0; 256]; 32],
default_lane: None,
}
}
pub fn get_lane(&self, lane: usize) -> [u32; 256] {
*self.values.get(lane).unwrap()
}
pub fn get_lane_mut(&mut self, lane: usize) -> &mut [u32; 256] {
self.values.get_mut(lane).unwrap()
}
}
pub trait Value {
fn mut_hi16(&mut self, val: u16);
fn mut_lo16(&mut self, val: u16);
}
impl Value for u32 {
fn mut_hi16(&mut self, val: u16) {
*self = ((val as u32) << 16) | (*self as u16 as u32);
}
fn mut_lo16(&mut self, val: u16) {
*self = ((((*self & (0xffff << 16)) >> 16) as u32) << 16) | val as u32;
}
}
#[derive(Debug, Clone, Copy)]
pub struct WaveValue {
pub value: u32,
pub warp_size: usize,
pub default_lane: Option<usize>,
pub mutations: Option<[bool; 32]>,
}
impl WaveValue {
pub fn new(value: u32, warp_size: usize) -> Self {
Self {
value,
warp_size,
default_lane: None,
mutations: None,
}
}
pub fn read(&self) -> bool {
(self.value >> self.default_lane.unwrap()) & 1 == 1
}
pub fn set_lane(&mut self, value: bool) {
if self.mutations.is_none() {
self.mutations = Some([false; 32])
}
self.mutations.as_mut().unwrap()[self.default_lane.unwrap()] = value;
}
pub fn apply_muts(&mut self) {
self.value = 0;
for lane in 0..self.warp_size {
if self.mutations.unwrap()[lane] {
self.value |= 1 << lane;
}
}
}
}
/// C-compatible state snapshot for FFI - used for comparing emulator states
#[repr(C)]
#[derive(Clone, Debug)]
pub struct StateSnapshot {
pub pc: u32,
pub scc: u32,
pub vcc: u32,
pub exec_mask: u32,
pub sgpr: [u32; 128],
pub vgpr: [[u32; 256]; 32],
}
impl StateSnapshot {
pub fn new() -> Self {
Self { pc: 0, scc: 0, vcc: 0, exec_mask: 0, sgpr: [0; 128], vgpr: [[0; 256]; 32] }
}
}
#[derive(Clone, Debug)]
pub struct VecDataStore {
pub data: Vec<u8>,
}
impl VecDataStore {
pub fn new() -> Self {
Self { data: Vec::new() }
}
pub fn write(&mut self, addr: usize, val: u32) {
if addr + 4 >= self.data.len() {
self.data.resize(self.data.len() + addr + 5, 0);
}
self.data[addr..addr + 4].iter_mut().enumerate().for_each(|(i, x)| {
*x = val.to_le_bytes()[i];
});
}
pub fn write64(&mut self, addr: usize, val: u64) {
self.write(addr, (val & 0xffffffff) as u32);
self.write(addr + 4, ((val & (0xffffffff << 32)) >> 32) as u32);
}
pub fn read(&self, addr: usize) -> u32 {
let mut bytes: [u8; 4] = [0; 4];
bytes.copy_from_slice(&self.data[addr + 0..addr + 4]);
u32::from_le_bytes(bytes)
}
pub fn read64(&mut self, addr: usize) -> u64 {
let lsb = self.read(addr);
let msb = self.read(addr + 4);
((msb as u64) << 32) | lsb as u64
}
}
#[cfg(test)]
mod test_state {
use super::*;
#[test]
fn test_wave_value() {
let mut val = WaveValue::new(0b11000000000000011111111111101110, 32);
val.default_lane = Some(0);
assert!(!val.read());
val.default_lane = Some(31);
assert!(val.read());
}
#[test]
fn test_wave_value_small() {
let mut val = WaveValue::new(0, 1);
val.default_lane = Some(0);
assert!(!val.read());
assert_eq!(val.value, 0);
val.set_lane(true);
val.apply_muts();
assert!(val.read());
assert_eq!(val.value, 1);
}
#[test]
fn test_wave_value_small_alt() {
let mut val = WaveValue::new(0, 2);
val.default_lane = Some(0);
assert!(!val.read());
assert_eq!(val.value, 0);
val.set_lane(true);
val.apply_muts();
assert!(val.read());
assert_eq!(val.value, 1);
}
#[test]
fn test_wave_value_exec() {
let warp_size = 32;
let val = WaveValue::new(u32::MAX, warp_size);
assert_eq!(val.value, u32::MAX);
let warp_size = 3;
let val = WaveValue::new((1 << warp_size) - 1, warp_size);
assert_eq!(val.value, 7)
}
#[test]
fn test_wave_value_toggle_one() {
let warp_size = 2;
let mut val = WaveValue::new(0b11, warp_size);
// 0
val.default_lane = Some(0);
val.set_lane(false);
// 1
val.default_lane = Some(1);
val.set_lane(true);
val.apply_muts();
assert_eq!(val.value, 2);
}
#[test]
fn test_wave_value_mutate_small() {
let mut val = WaveValue::new(0, 2);
val.default_lane = Some(0);
assert!(!val.read());
assert_eq!(val.value, 0);
val.set_lane(true);
val.apply_muts();
assert!(val.read());
assert_eq!(val.value, 1);
}
#[test]
fn test_wave_value_mutations() {
let mut val = WaveValue::new(0b10001, 32);
val.default_lane = Some(0);
val.set_lane(false);
assert!(val.mutations.unwrap().iter().all(|x| !x));
val.default_lane = Some(1);
val.set_lane(true);
assert_eq!(val.value, 0b10001);
assert_eq!(
val.mutations,
Some([
false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
false, false, false, false, false, false, false, false, false, false, false, false, false,
])
);
val.apply_muts();
assert_eq!(val.value, 0b10);
}
#[test]
fn test_write16() {
let mut vgpr = VGPR::new();
vgpr.default_lane = Some(0);
vgpr[0] = 0b11100000000000001111111111111111;
vgpr[0].mut_lo16(0b1011101111111110);
assert_eq!(vgpr[0], 0b11100000000000001011101111111110);
}
#[test]
fn test_write16hi() {
let mut vgpr = VGPR::new();
vgpr.default_lane = Some(0);
vgpr[0] = 0b11100000000000001111111111111111;
vgpr[0].mut_hi16(0b1011101111111110);
assert_eq!(vgpr[0], 0b10111011111111101111111111111111);
}
#[test]
fn test_vgpr() {
let mut vgpr = VGPR::new();
vgpr.default_lane = Some(0);
vgpr[0] = 42;
vgpr.default_lane = Some(10);
vgpr[0] = 10;
assert_eq!(vgpr.get_lane(0)[0], 42);
assert_eq!(vgpr.get_lane(10)[0], 10);
}
}

File diff suppressed because it is too large Load diff

View file

@ -1,323 +0,0 @@
use crate::helpers::{colored, DEBUG};
use crate::state::{Register, StateSnapshot, VecDataStore, WaveValue, VGPR};
use crate::thread::{Thread, END_PRG, SGPR_COUNT};
use std::collections::HashMap;
pub const WAVE_SIZE: usize = 32;
pub struct WorkGroup<'a> {
dispatch_dim: u32,
id: [u32; 3],
lds: VecDataStore,
kernel: &'a Vec<u32>,
kernel_args: *const u64,
launch_bounds: [u32; 3],
wave_state: HashMap<usize, WaveState>,
}
#[derive(Debug, Clone)]
struct WaveState {
scalar_reg: [u32; SGPR_COUNT],
scc: u32,
vcc: WaveValue,
exec: WaveValue,
vec_reg: VGPR,
pc: usize,
sds: HashMap<usize, VecDataStore>,
}
const SYNCS: [u32; 4] = [0xBF89FC07, 0xBC7C0000, 0xBF890007, 0xbFB60003];
const S_BARRIER: u32 = 0xBFBD0000;
/// Context for single-stepping through a wave - holds all mutable state
pub struct WaveContext {
pub kernel: Vec<u32>,
pub scalar_reg: [u32; SGPR_COUNT],
pub scc: u32,
pub pc: usize,
pub vec_reg: VGPR,
pub vcc: WaveValue,
pub exec: WaveValue,
pub lds: VecDataStore,
pub sds: HashMap<usize, VecDataStore>,
pub n_lanes: usize,
}
impl WaveContext {
pub fn new(kernel: Vec<u32>, n_lanes: usize) -> Self {
let active = (!0u32).wrapping_shr(32 - (n_lanes as u32));
Self {
kernel,
scalar_reg: [0; SGPR_COUNT],
scc: 0,
pc: 0,
vec_reg: VGPR::new(),
vcc: WaveValue::new(0, n_lanes),
exec: WaveValue::new(active, n_lanes),
lds: VecDataStore::new(),
sds: (0..=31).map(|i| (i, VecDataStore::new())).collect(),
n_lanes,
}
}
/// Execute a single instruction. Returns: 0=continue, -1=endpgm, -2=barrier, 1=done (pc past program), negative=error
pub fn step(&mut self) -> i32 {
if self.pc >= self.kernel.len() { return 1; }
if self.kernel[self.pc] == END_PRG { return -1; }
if self.kernel[self.pc] == S_BARRIER { self.pc += 1; return -2; }
// Skip sync/nop instructions
if SYNCS.contains(&self.kernel[self.pc]) || self.kernel[self.pc] >> 20 == 0xbf8 || self.kernel[self.pc] == 0x7E000000 {
self.pc += 1;
return 0;
}
let mut sgpr_co = None;
for lane_id in 0..self.n_lanes {
self.vec_reg.default_lane = Some(lane_id);
self.vcc.default_lane = Some(lane_id);
self.exec.default_lane = Some(lane_id);
let mut thread = Thread {
scalar_reg: &mut self.scalar_reg,
scc: &mut self.scc,
vec_reg: &mut self.vec_reg,
vcc: &mut self.vcc,
exec: &mut self.exec,
lds: &mut self.lds,
sds: &mut self.sds.get_mut(&lane_id).unwrap(),
pc_offset: 0,
stream: self.kernel[self.pc..].to_vec(),
scalar: false,
simm: None,
warp_size: self.n_lanes,
sgpr_co: &mut sgpr_co,
};
if let Err(e) = thread.interpret() { return e; }
if thread.scalar {
self.pc = ((self.pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
break;
}
if lane_id == self.n_lanes - 1 {
self.pc = ((self.pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
}
}
if self.vcc.mutations.is_some() { self.vcc.apply_muts(); self.vcc.mutations = None; }
if self.exec.mutations.is_some() { self.exec.apply_muts(); self.exec.mutations = None; }
if let Some((idx, mut wv)) = sgpr_co.take() { wv.apply_muts(); self.scalar_reg[idx] = wv.value; }
0
}
pub fn get_snapshot(&self) -> StateSnapshot {
let mut snap = StateSnapshot::new();
snap.pc = self.pc as u32;
snap.scc = self.scc;
snap.vcc = self.vcc.value;
snap.exec_mask = self.exec.value;
snap.sgpr = self.scalar_reg;
for lane in 0..32 { snap.vgpr[lane] = self.vec_reg.get_lane(lane); }
snap
}
}
impl<'a> WorkGroup<'a> {
pub fn new(dispatch_dim: u32, id: [u32; 3], launch_bounds: [u32; 3], kernel: &'a Vec<u32>, kernel_args: *const u64) -> Self {
Self { dispatch_dim, id, kernel, launch_bounds, kernel_args, lds: VecDataStore::new(), wave_state: HashMap::new() }
}
pub fn exec_waves(&mut self) -> Result<(), i32> {
let mut threads = vec![];
for z in 0..self.launch_bounds[2] {
for y in 0..self.launch_bounds[1] {
for x in 0..self.launch_bounds[0] {
threads.push([x, y, z])
}
}
}
let waves = threads.chunks(WAVE_SIZE).collect::<Vec<_>>();
let mut sync = false;
for (i, x) in self.kernel.iter().enumerate() {
if i != 0 && *x == S_BARRIER {
sync = true;
break;
}
}
for _ in 0..=(sync as usize) {
for w in waves.iter().enumerate() {
self.exec_wave(w)?
}
}
Ok(())
}
fn exec_wave(&mut self, (wave_id, threads): (usize, &&[[u32; 3]])) -> Result<(), i32> {
let (mut scalar_reg, mut scc, mut pc, mut vec_reg, mut vcc, mut exec, mut sds) = match self.wave_state.get(&wave_id) {
None => {
let mut scalar_reg = [0; SGPR_COUNT];
scalar_reg.write64(0, self.kernel_args as u64);
let [gx, gy, gz] = self.id;
match self.dispatch_dim {
3 => (scalar_reg[13], scalar_reg[14], scalar_reg[15]) = (gx, gy, gz),
2 => (scalar_reg[14], scalar_reg[15]) = (gx, gy),
_ => scalar_reg[15] = gx,
}
let mut vec_reg = VGPR::new();
for (t, [x, y, z]) in threads.iter().enumerate() {
vec_reg.get_lane_mut(t)[0] = match &self.launch_bounds {
[_, 1, 1] => *x,
_ => (z << 20) | (y << 10) | x,
}
}
let vcc = WaveValue::new(0, threads.len());
let active = (!0u32).wrapping_shr(32 - (threads.len() as u32));
let exec = WaveValue::new(active, threads.len());
let sds = (0..=31).map(|i| (i, VecDataStore::new())).collect();
(scalar_reg, 0, 0, vec_reg, vcc, exec, sds)
}
Some(val) => {
let val = val.clone();
(val.scalar_reg, val.scc, val.pc, val.vec_reg, val.vcc, val.exec, val.sds)
}
};
loop {
if self.kernel[pc] == END_PRG {
break Ok(());
}
if self.kernel[pc] == S_BARRIER && self.wave_state.get(&wave_id).is_none() {
self.wave_state.insert(wave_id, WaveState { scalar_reg, scc, vec_reg, vcc, exec, pc, sds });
break Ok(());
}
if self.kernel[pc] == S_BARRIER || SYNCS.contains(&self.kernel[pc]) || self.kernel[pc] >> 20 == 0xbf8 || self.kernel[pc] == 0x7E000000 {
pc += 1;
continue;
}
let mut sgpr_co = None;
for (lane_id, [x, y, z]) in threads.iter().enumerate() {
vec_reg.default_lane = Some(lane_id);
vcc.default_lane = Some(lane_id);
exec.default_lane = Some(lane_id);
if *DEBUG {
let lane = format!("{:<2} {:08X} ", lane_id, self.kernel[pc]);
let state = match exec.read() {
true => "green",
false => "gray",
};
let [id0, id1, id2] = self.id;
print!("[{id0:<3} {id1:<3} {id2:<3}] [{x:<3} {y:<3} {z:<3}] {}", colored(&lane, state));
}
let mut thread = Thread {
scalar_reg: &mut scalar_reg,
scc: &mut scc,
vec_reg: &mut vec_reg,
vcc: &mut vcc,
exec: &mut exec,
lds: &mut self.lds,
sds: &mut sds.get_mut(&lane_id).unwrap(),
pc_offset: 0,
stream: self.kernel[pc..self.kernel.len()].to_vec(),
scalar: false,
simm: None,
warp_size: threads.len(),
sgpr_co: &mut sgpr_co,
};
thread.interpret()?;
if *DEBUG {
println!();
}
if thread.scalar {
pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
break;
}
if lane_id == threads.len() - 1 {
pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
}
}
if vcc.mutations.is_some() {
vcc.apply_muts();
vcc.mutations = None;
}
if exec.mutations.is_some() {
exec.apply_muts();
exec.mutations = None;
}
if let Some((idx, mut wv)) = sgpr_co.take() {
wv.apply_muts();
scalar_reg[idx] = wv.value;
}
}
}
}
#[cfg(test)]
mod test_workgroup {
use super::*;
// TODO: make this generic by adding the assembler
fn global_store_sgpr(addr: u64, instructions: Vec<u32>, src: u32) -> Vec<u32> {
[
instructions,
vec![
0x7E020200 + src,
0x7E0402FF,
addr as u32,
0x7E0602FF,
(addr >> 32) as u32,
0xDC6A0000,
0x007C0102,
],
vec![END_PRG],
]
.concat()
}
#[test]
fn test_wave_value_state_vcc() {
let mut ret: u32 = 0;
let kernel = vec![
0xBEEA00FF,
0b11111111111111111111111111111111, // initial vcc state
0x7E140282,
0x7C94010A, // cmp blockDim.x == 2
];
let addr = (&mut ret as *mut u32) as u64;
let kernel = global_store_sgpr(addr, kernel, 106);
let mut wg = WorkGroup::new(1, [0, 0, 0], [3, 1, 1], &kernel, [addr].as_ptr());
wg.exec_waves().unwrap();
assert_eq!(ret, 0b100);
}
#[test]
fn test_wave_value_state_exec() {
let mut ret: u32 = 0;
let kernel = vec![
0xBEFE00FF,
0b11111111111111111111111111111111,
0x7E140282,
0x7D9C010A, // cmpx blockDim.x <= 2
];
let addr = (&mut ret as *mut u32) as u64;
let kernel = global_store_sgpr(addr, kernel, 126);
let mut wg = WorkGroup::new(1, [0, 0, 0], [4, 1, 1], &kernel, [addr].as_ptr());
wg.exec_waves().unwrap();
assert_eq!(ret, 0b0111);
}
#[test]
fn test_wave_value_sgpr_co() {
let mut ret: u32 = 0;
let kernel = vec![0xBE8D00FF, 0x7FFFFFFF, 0x7E1402FF, u32::MAX, 0xD700000A, 0x0002010A];
let addr = (&mut ret as *mut u32) as u64;
let kernel = global_store_sgpr(addr, kernel, 0);
let mut wg = WorkGroup::new(1, [0, 0, 0], [5, 1, 1], &kernel, [addr].as_ptr());
wg.exec_waves().unwrap();
assert_eq!(ret, 0b11110);
}
}

View file

@ -1,155 +0,0 @@
# ruff: noqa: F405, F403
# allow define from star imports
import numpy as np
import unittest
import subprocess, struct, math, functools
from tinygrad import Tensor, dtypes, Device
from tinygrad.helpers import getenv
from tinygrad.runtime.autogen.amd.rdna3.ins import *
from tinygrad.renderer.amd.asm import waitcnt
from test.testextra.test_cfg_viz import asm_kernel
def get_output(asm:list, n_threads:int=1, vdst:VGPR=v[1]):
out = Tensor([0]*n_threads, dtype=dtypes.uint32).realize()
insts = [
s_load_b64(s[0:1], s[0:1], NULL),
*asm,
v_lshlrev_b32_e32(v[0], 2, v[0]),
s_waitcnt(simm16=waitcnt(lgkmcnt=0)),
#global_store_b32(v[0], v[1], s[0:1]),
global_store_b32(addr=v[0], data=vdst, saddr=s[0:1]),
s_endpgm()
]
out = Tensor.custom_kernel(out, fxn=functools.partial(asm_kernel, name="test", insts=insts, device=out.device, n_threads=n_threads))[0]
out.realize()
return out.tolist()
def f16_to_bits(x:float) -> int: return struct.unpack('<H', struct.pack('<e', x))[0]
def f32_from_bits(x:int) -> float: return struct.unpack('<f', struct.pack('<I', x))[0]
def f32_to_bits(x:float) -> int: return struct.unpack('<I', struct.pack('<f', x))[0]
@unittest.skipUnless(Device.DEFAULT == "AMD", "tests RDNA3")
class TestHW(unittest.TestCase):
def setUp(self):
if getenv("MOCKGPU"): subprocess.run(["cargo", "build", "--release", "--manifest-path", "./extra/remu/Cargo.toml"], check=True)
def test_simple_v_mov(self):
out = get_output([
v_mov_b32_e32(v[1], 2),
])
self.assertEqual(out, [2])
def test_simple_s_mov(self):
out = get_output([
s_mov_b32(s[7], 0x7fffffff),
v_mov_b32_e32(v[1], s[7]),
])
self.assertEqual(out, [0x7fffffff])
def test_exec_mov(self):
out = get_output([
v_mov_b32_e32(v[1], 42),
s_mov_b32(EXEC_LO, 0b10),
v_mov_b32_e32(v[1], 10),
s_mov_b32(EXEC_LO, 0b11),
], n_threads=2)
np.testing.assert_equal(out, [42, 10])
def test_exec_cmp_vopc(self):
out = get_output([
s_mov_b32(VCC_LO, 0), # reset vcc
v_mov_b32_e32(v[1], 42),
v_mov_b32_e32(v[2], 10),
s_mov_b32(EXEC_LO, 0b01),
v_cmp_ne_u32_e32(v[1], v[2]),
s_mov_b32(EXEC_LO, 0b11),
v_mov_b32_e32(v[1], VCC_LO),
], n_threads=2)[0]
np.testing.assert_equal(out, 1)
def test_exec_cmpx_vop3(self):
out = get_output([
s_mov_b32(EXEC_LO, 0b11),
v_mov_b32_e32(v[1], 42),
v_mov_b32_e32(v[2], 10),
s_mov_b32(EXEC_LO, 0b01),
v_cmpx_ne_u32_e32(v[1], v[2]),
s_mov_b32(s[10], EXEC_LO),
s_mov_b32(EXEC_LO, 0b11),
v_mov_b32_e32(v[1], s[10]),
], n_threads=2)[0]
np.testing.assert_equal(out & 0b11, 0b01)
def test_fmac_vop3_modifier(self):
init_state = [
v_mov_b32_e32(a:=v[1], f16_to_bits(4.0)),
v_mov_b32_e32(b:=v[2], f16_to_bits(3.0)),
v_mov_b32_e32(c:=v[3], f16_to_bits(2.0)),
]
def run_fmac(a, b): return get_output(init_state+[v_fmac_f16_e64(c, a, b)], vdst=c)[0]
self.assertEqual(run_fmac(a, b), f16_to_bits(14.0))
self.assertEqual(run_fmac(a, -b), f16_to_bits(-10.0))
self.assertEqual(run_fmac(-a, -b), f16_to_bits(14.0))
def test_s_abs_i32(self):
def check(x, y, dst=s[10], scc=0):
for reg,val in [(dst, y), (SCC, scc)]:
self.assertEqual(get_output([
s_mov_b32(dst, x),
s_abs_i32(dst, dst),
v_mov_b32_e32(v[1], reg)
])[0], val)
check(0x00000001, 0x00000001, scc=1)
check(0x7fffffff, 0x7fffffff, scc=1)
check(0x80000000, 0x80000000, scc=1)
check(0x80000001, 0x7fffffff, scc=1)
check(0x80000002, 0x7ffffffe, scc=1)
check(0xffffffff, 0x00000001, scc=1)
check(0, 0, scc=0)
def test_v_rcp_f32_neg_vop3(self):
def v_neg_rcp_f32(x:float, y:float):
out = get_output([
v_mov_b32_e32(v[2], f32_to_bits(x)),
v_rcp_f32_e64(v[2], -v[2]),
], vdst=v[2])[0]
assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
v_neg_rcp_f32(math.inf, -0.0)
v_neg_rcp_f32(-math.inf, 0.0)
v_neg_rcp_f32(0.0, -math.inf)
v_neg_rcp_f32(-0.0, math.inf)
v_neg_rcp_f32(-2.0, 0.5)
v_neg_rcp_f32(2.0, -0.5)
def test_v_cndmask_b32_neg(self):
def v_neg(x:float, y:float):
out = get_output([
v_mov_b32_e32(v[1], f32_to_bits(x)),
s_mov_b32(s[10], 1),
v_cndmask_b32_e64(v[1], v[1], -v[1], s[10]),
])[0]
assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
v_neg(-0.0, 0.0)
v_neg(0.0, -0.0)
v_neg(2.0, -2.0)
v_neg(math.inf, -math.inf)
v_neg(-math.inf, math.inf)
@unittest.skip("how does VOPD work in the dsl")
def test_v_subrev_wrap(self):
out = get_output([
#v_dual_mov_b32(v[1], 0xffffffff, v[2], 0x0),
#v_dual_mov_b32(vdstx=v[1], srcx=0xffffffff, vdsty=v[2], srcy=0x0),
#VOPD(opx=VOPDOp.V_DUAL_MOV_B32, opy=VOPDOp.V_DUAL_MOV_B32, vdstx=v[1], srcx=0xffffffff, vdsty=v[2], srcy=0x0),
v_subrev_co_u32(v[2], VCC_LO, v[2], v[1]),
], vdst=v[2])[0]
self.assertEqual(out, 0xffff_ffff)
if __name__ == "__main__":
unittest.main()

View file

@ -3,7 +3,7 @@ INSTALL_PATH="${1:-/opt/homebrew/lib}"
if [ ! -d "$INSTALL_PATH" ]; then
USER=$(whoami)
echo "No path $INSTALL_PATH. Will create. Might need your password..."
echo "You can stop now and provide any location as an argument where you want to save the libs (note, that not default locations should be in LD_LIBRARY_PATH, so tinygrad can find the libs)."
echo "You can stop now and provide any location as an argument where you want to save the library (note, that not default locations should be in LD_LIBRARY_PATH, so tinygrad can find it)."
echo "Press any key or symbol to continue..."
read -n 1 -s
@ -11,11 +11,6 @@ if [ ! -d "$INSTALL_PATH" ]; then
sudo chown -R "$USER":staff "$INSTALL_PATH"
fi
# Download libremu.dylib
curl -s https://api.github.com/repos/Qazalin/remu/releases/latest | \
jq -r '.assets[] | select(.name == "libremu.dylib").browser_download_url' | \
xargs curl -L -o $INSTALL_PATH/libremu.dylib
# Download libamd_comgr.dylib
curl -s https://api.github.com/repos/tinygrad/amdcomgr_dylib/releases/latest | \
jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \

View file

@ -1,528 +0,0 @@
# Test to compare Python and Rust RDNA3 emulators by running real tinygrad kernels
import unittest, ctypes
from dataclasses import dataclass
from pathlib import Path
from tinygrad import Device
from test.mockgpu.amd.emu import WaveState, _decode_at, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
from tinygrad.renderer.amd import decode_inst
import tinygrad
REMU_PATH = Path(tinygrad.__file__).parent.parent / "extra/remu/target/release/libremu.so"
if not REMU_PATH.exists(): REMU_PATH = Path(tinygrad.__file__).parent.parent / "extra/remu/target/release/libremu.dylib"
def set_valid_mem_ranges(ranges): pass # emu2 doesn't need this
def _is_f32_nan(bits: int) -> bool:
"""Check if 32-bit value is a NaN (exponent all 1s, mantissa non-zero)."""
return (bits & 0x7f800000) == 0x7f800000 and (bits & 0x007fffff) != 0
def _vals_equal(a: int, b: int) -> bool:
"""Compare two 32-bit values, treating all NaN bit patterns as equal."""
if a == b: return True
return _is_f32_nan(a) and _is_f32_nan(b)
@dataclass
class KernelSnapshot:
code: bytes
src: str
global_size: tuple[int, int, int]
local_size: tuple[int, int, int]
buf_idxs: list[int] # indices into shared buffer pool
buf_sizes: list[int] # sizes for each buffer index
@dataclass
class StateSnapshot:
pc: int
scc: int
vcc: int
exec_mask: int
sgpr: list[int]
vgpr: list[list[int]]
def diff(self, other: 'StateSnapshot', n_lanes: int, arrow: str = " vs ") -> list[str]:
"""Return list of differences between two states."""
diffs = []
if self.pc != other.pc: diffs.append(f"pc: {self.pc}{arrow}{other.pc}")
if self.scc != other.scc: diffs.append(f"scc: {self.scc}{arrow}{other.scc}")
if self.vcc != other.vcc: diffs.append(f"vcc: 0x{self.vcc:08x}{arrow}0x{other.vcc:08x}")
if self.exec_mask != other.exec_mask: diffs.append(f"exec: 0x{self.exec_mask:08x}{arrow}0x{other.exec_mask:08x}")
for i, (a, b) in enumerate(zip(self.sgpr, other.sgpr)):
# Skip VCC_LO/HI (106/107) and EXEC_LO/HI (126/127) as they alias vcc/exec_mask which are compared separately
if i in (106, 107, 126, 127): continue
if not _vals_equal(a, b): diffs.append(f"sgpr[{i}]: 0x{a:08x}{arrow}0x{b:08x}")
for lane in range(n_lanes):
for i, (a, b) in enumerate(zip(self.vgpr[lane], other.vgpr[lane])):
if not _vals_equal(a, b): diffs.append(f"vgpr[{lane}][{i}]: 0x{a:08x}{arrow}0x{b:08x}")
return diffs
class CStateSnapshot(ctypes.Structure):
_fields_ = [("pc", ctypes.c_uint32), ("scc", ctypes.c_uint32), ("vcc", ctypes.c_uint32), ("exec_mask", ctypes.c_uint32),
("sgpr", ctypes.c_uint32 * 128), ("vgpr", (ctypes.c_uint32 * 256) * 32)]
def to_snapshot(self) -> StateSnapshot:
return StateSnapshot(pc=self.pc, scc=self.scc, vcc=self.vcc, exec_mask=self.exec_mask,
sgpr=list(self.sgpr), vgpr=[list(self.vgpr[i]) for i in range(32)])
class RustEmulator:
def __init__(self):
self.lib = ctypes.CDLL(str(REMU_PATH))
self.lib.wave_create.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32]
self.lib.wave_create.restype = ctypes.c_void_p
self.lib.wave_step.argtypes = [ctypes.c_void_p]
self.lib.wave_step.restype = ctypes.c_int32
self.lib.wave_get_snapshot.argtypes = [ctypes.c_void_p, ctypes.POINTER(CStateSnapshot)]
self.lib.wave_set_sgpr.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32]
self.lib.wave_set_vgpr.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32]
self.lib.wave_init_lds.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
self.lib.wave_free.argtypes = [ctypes.c_void_p]
self.ctx = None
def create(self, kernel: bytes, n_lanes: int):
kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
self.ctx = self.lib.wave_create(ctypes.addressof(kernel_buf), len(kernel), n_lanes)
self._kernel_buf = kernel_buf
def step(self) -> int: return self.lib.wave_step(self.ctx)
def set_sgpr(self, idx: int, val: int): self.lib.wave_set_sgpr(self.ctx, idx, val)
def set_vgpr(self, lane: int, idx: int, val: int): self.lib.wave_set_vgpr(self.ctx, lane, idx, val)
def init_lds(self, size: int): self.lib.wave_init_lds(self.ctx, size)
def get_snapshot(self) -> StateSnapshot:
snap = CStateSnapshot()
self.lib.wave_get_snapshot(self.ctx, ctypes.byref(snap))
return snap.to_snapshot()
def free(self):
if self.ctx:
self.lib.wave_free(self.ctx)
self.ctx = None
class PythonEmulator:
def __init__(self):
self.state: WaveState | None = None
self.program: dict[int, tuple] = {} # lazily populated: pc -> (name, fxn, globals)
self.vmem_buf = None
self.lds_buf = None
self.kernel_buf = None # Keep kernel bytes alive
self.lib_addr = 0 # Base address of kernel code
def create(self, kernel: bytes, n_lanes: int):
import ctypes
from tinygrad.device import Buffer, BufferSpec
from tinygrad.dtype import dtypes
# Store kernel in a ctypes buffer so _decode_at can read from memory at actual PC address
self.kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
self.lib_addr = ctypes.addressof(self.kernel_buf)
self.program = {}
self.state = WaveState(n_lanes)
self.state.pc = self.lib_addr # Set PC to code base address
self.vmem_buf = Buffer('CPU', 1 << 40, dtypes.uint32, options=BufferSpec(external_ptr=0)).ensure_allocated()
self.lds_buf = Buffer('CPU', 65536 // 4, dtypes.uint32).ensure_allocated()
def _ensure_decoded(self, pc: int):
if pc not in self.program:
runner, _ = _decode_at(pc, "rdna3")
self.program[pc] = (runner.p.function_name, runner._prg.fxn, runner.p.globals)
def step(self) -> int:
import ctypes
assert self.state is not None
pc = self.state.pc
if pc == 0xFFFFFFFFFFFFFFFF: return -1
self._ensure_decoded(pc)
name, fxn, globals_list = self.program[pc]
buf_addrs = {0: self.state.sgpr_buf._buf.va_addr, 1: self.state.vgpr_buf._buf.va_addr, # type: ignore[union-attr]
2: self.vmem_buf._buf.va_addr, 3: self.lds_buf._buf.va_addr} # type: ignore[union-attr]
fxn(*[ctypes.c_uint64(buf_addrs[g]) for g in globals_list], ctypes.c_int32(0))
return -1 if self.state.pc == 0xFFFFFFFFFFFFFFFF else 0
def set_sgpr(self, idx: int, val: int):
assert self.state is not None
self.state._write_sgpr(idx, val)
def set_vgpr(self, lane: int, idx: int, val: int):
assert self.state is not None
self.state._write_vgpr(idx, lane, val)
def get_snapshot(self) -> StateSnapshot:
assert self.state is not None
sgpr = [self.state._read_sgpr(i) for i in range(128)]
vgpr = [[self.state._read_vgpr(reg, lane) for reg in range(256)] for lane in range(WAVE_SIZE)]
# Convert actual PC address to word offset for comparison with Rust emulator
pc_offset = (self.state.pc - self.lib_addr) // 4 if self.state.pc != 0xFFFFFFFFFFFFFFFF else 0xFFFFFFFFFFFFFFFF
return StateSnapshot(pc=pc_offset, scc=self.state._read_sgpr(SCC.offset), vcc=sgpr[VCC_LO.offset],
exec_mask=sgpr[EXEC_LO.offset], sgpr=sgpr, vgpr=vgpr)
def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: tuple[int, int, int],
local_size: tuple[int, int, int], max_steps: int, debug: bool, trace_len: int,
kernel_idx: int = 0, max_workgroups: int = 8) -> tuple[bool, str, int]:
"""Run a single kernel through both emulators. Returns (success, message, total_steps)."""
gx, gy, gz = global_size
lx, ly, lz = local_size
total_steps = 0
wg_count = 0
for gidz in range(gz):
for gidy in range(gy):
for gidx in range(gx):
if wg_count >= max_workgroups: return True, f"Completed {wg_count} workgroups (limit reached)", total_steps
wg_count += 1
rust = RustEmulator()
python = PythonEmulator()
rust.create(kernel, n_lanes)
python.create(kernel, n_lanes)
# Initialize LDS (64KB, standard size for AMD GPUs)
rust.init_lds(65536)
for emu in (rust, python):
emu.set_sgpr(0, args_ptr & 0xffffffff)
emu.set_sgpr(1, (args_ptr >> 32) & 0xffffffff)
emu.set_sgpr(13, gidx)
emu.set_sgpr(14, gidy)
emu.set_sgpr(15, gidz)
# Initialize v[0] with packed workitem IDs for each lane
for lane in range(n_lanes):
tid = lane
z, y, x = tid // (lx * ly), (tid // lx) % ly, tid % lx
emu.set_vgpr(lane, 0, (z << 20) | (y << 10) | x)
step = 0
trace: list[tuple[int, int, str, StateSnapshot, StateSnapshot]] = []
prev_sync_after = False # Track if previous instruction had known Rust bugs
try:
while step < max_steps:
rust_before = rust.get_snapshot()
python_before = python.get_snapshot()
pc_addr = python.lib_addr + python_before.pc * 4 # Convert word offset to actual address
python._ensure_decoded(pc_addr)
inst_hex_name = python.program[pc_addr][0]
# Decode the instruction to get mnemonic for sync_after checks
try:
# Format is mnemonic_hexbytes, e.g. v_exp_f32_e32_014b027e -> hex is 014b027e
parts = inst_hex_name.rsplit('_', 1)
inst_bytes_hex = parts[1] if len(parts) == 2 else ""
inst_bytes = bytes.fromhex(inst_bytes_hex) if inst_bytes_hex else b''
decoded = decode_inst(inst_bytes) if inst_bytes else None
inst_mnemonic = repr(decoded).split('(')[0] if decoded else ""
except Exception:
inst_mnemonic = ""
# For generic instructions, use function name for sync_after check
if not inst_mnemonic: inst_mnemonic = inst_hex_name
inst_str = inst_hex_name
trace.append((step, python_before.pc, inst_str, rust_before, python_before))
if len(trace) > trace_len: trace.pop(0)
if debug: print(f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: PC={python_before.pc}, inst={inst_str}")
# Instructions with known Rust emulator bugs or precision differences - sync Python to Rust after execution
# v_div_scale/v_div_fixup: Rust has different VCC handling
# v_cvt_f16_f32: Rust clears high 16 bits, but hardware (and Python) preserves them
# s_add_i32/s_sub_i32: Rust has incorrect SCC overflow detection
# v_exp_f32/v_log_f32/v_ldexp_f32: precision differences in transcendental functions
# s_delay_alu: Rust handles differently
# v_add_co_ci_u32/v_sub_co_ci_u32/v_subrev_co_ci_u32: Rust preserves inactive VCC bits, but hardware clears all bits
sync_after = any(x in inst_mnemonic.lower() for x in ('v_div_scale', 'v_div_fixup', 'v_cvt_f16_f32', 's_add_i32', 's_sub_i32',
'v_exp_f32', 'v_log_f32', 'v_ldexp_f32', 's_delay_alu',
'v_add_co_ci_u32', 'v_sub_co_ci_u32', 'v_subrev_co_ci_u32'))
# Skip comparison if previous instruction had known Rust bugs (states were synced but may still differ slightly)
diffs = rust_before.diff(python_before, n_lanes) if not prev_sync_after else []
if diffs:
trace_lines = []
for idx, (s, pc, d, rb, pb) in enumerate(trace):
trace_lines.append(f" step {s}: PC={pc:3d} {d}")
if idx < len(trace) - 1:
next_rb, next_pb = trace[idx + 1][3:5]
rust_diffs = rb.diff(next_rb, n_lanes, "->")
python_diffs = pb.diff(next_pb, n_lanes, "->")
if rust_diffs: trace_lines.append(f" rust: {', '.join(rust_diffs[:5])}")
if python_diffs: trace_lines.append(f" python: {', '.join(python_diffs[:5])}")
elif rust_diffs: trace_lines.append(" python: (no changes)")
else:
# Last traced instruction - compare with current state
rust_diffs = rb.diff(rust_before, n_lanes, "->")
python_diffs = pb.diff(python_before, n_lanes, "->")
if rust_diffs: trace_lines.append(f" rust: {', '.join(rust_diffs[:5])}")
if python_diffs: trace_lines.append(f" python: {', '.join(python_diffs[:5])}")
elif rust_diffs: trace_lines.append(" python: (no changes)")
trace_str = "\n".join(trace_lines)
msg = f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step} before inst '{inst_str}': states differ (rust vs python):\n "
msg += "\n ".join(diffs[:10]) + f"\n Recent instructions:\n{trace_str}"
return False, msg, total_steps
rust_result = rust.step()
python_result = python.step()
if rust_result != python_result:
# Rust returns 1 for unsupported instructions - skip test
if rust_result == 1 and python_result == 0:
raise unittest.SkipTest(f"Rust emulator doesn't support instruction: {inst_str}")
trace_str = "\n".join(f" step {s}: PC={pc:3d} {d}" for s, pc, d, _, _ in trace)
msg = (f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: different return codes: "
f"rust={rust_result}, python={python_result}, inst={inst_str}\n Recent instructions:\n{trace_str}")
return False, msg, total_steps
# Sync Python state to Rust after instructions with known Rust emulator differences
if sync_after:
rust_after = rust.get_snapshot()
for i in range(128): python.set_sgpr(i, rust_after.sgpr[i])
for lane in range(n_lanes):
for i in range(256): python.set_vgpr(lane, i, rust_after.vgpr[lane][i])
assert python.state is not None
# Convert Rust's word-based PC to Python's actual address
python.state.pc = python.lib_addr + rust_after.pc * 4
python.state._write_sgpr(SCC.offset, rust_after.scc)
python.state._write_sgpr(VCC_LO.offset, rust_after.vcc)
python.state._write_sgpr(EXEC_LO.offset, rust_after.exec_mask)
prev_sync_after = sync_after
if rust_result == -1:
total_steps += step + 1
break
if rust_result == 1:
total_steps += step + 1
break
if rust_result < 0 and rust_result != -2:
return False, f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: error code {rust_result}", total_steps
step += 1
else:
return False, f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Max steps ({max_steps}) reached", total_steps
finally:
rust.free()
return True, f"Completed {gx*gy*gz} workgroups", total_steps
def compare_emulators_multi_kernel(kernels: list[KernelSnapshot], buf_pool: dict[int, int], max_steps: int = 1000,
debug: bool = False, trace_len: int = 10, buf_data: dict[int, bytes] | None = None) -> tuple[bool, str]:
"""Run all kernels through both emulators with shared buffer pool."""
if buf_data is None: buf_data = {}
# Allocate shared buffer pool with padding for over-reads (GPU loads up to 16 bytes at once)
buf_id_to_ptr: dict[int, int] = {}
buffers = []
for buf_id, size in buf_pool.items():
padded_size = ((size + 15) // 16) * 16 + 16 # round up to 16 bytes + extra padding
# Initialize with data from COPY if available
init_data = buf_data.get(buf_id, b'\x00' * padded_size)
init_list = list(init_data) + [0] * (padded_size - len(init_data))
buf = (ctypes.c_uint8 * padded_size)(*init_list[:padded_size])
buffers.append((buf, padded_size))
buf_id_to_ptr[buf_id] = ctypes.addressof(buf)
# Set up valid memory ranges
ranges = {(ctypes.addressof(b), size) for b, size in buffers}
total_steps = 0
for ki, kernel in enumerate(kernels):
# Create args array for this kernel's buffers
args = (ctypes.c_uint64 * len(kernel.buf_idxs))(*[buf_id_to_ptr[bid] for bid in kernel.buf_idxs])
args_ptr = ctypes.addressof(args)
# Update valid ranges to include this args array
kernel_ranges = ranges | {(args_ptr, ctypes.sizeof(args))}
set_valid_mem_ranges(kernel_ranges)
n_lanes = kernel.local_size[0] * kernel.local_size[1] * kernel.local_size[2]
ok, msg, steps = run_single_kernel(
kernel.code, min(n_lanes, 32), args_ptr, kernel.global_size,
kernel.local_size, max_steps, debug, trace_len, ki
)
total_steps += steps
if not ok:
return False, msg
return True, f"Completed {len(kernels)} kernels, {total_steps} total steps"
def compare_emulators_with_memory(kernel: bytes, n_lanes: int, buf_sizes: list, max_steps: int = 1000, debug: bool = False,
global_size: tuple[int, int, int] = (1, 1, 1), trace_len: int = 10) -> tuple[bool, str]:
"""Run both emulators with memory set up for tinygrad kernels, executing all workgroups. Legacy wrapper."""
# Allocate buffers
buffers = []
for size in buf_sizes:
buf = (ctypes.c_uint8 * size)(*[0] * size)
buffers.append(buf)
# Create args array with buffer pointers
args = (ctypes.c_uint64 * len(buffers))(*[ctypes.addressof(b) for b in buffers])
args_ptr = ctypes.addressof(args)
# Set up valid memory ranges for Python emulator
ranges = {(ctypes.addressof(b), len(b)) for b in buffers}
ranges.add((args_ptr, ctypes.sizeof(args)))
set_valid_mem_ranges(ranges)
# Legacy wrapper assumes local_size = (n_lanes, 1, 1)
ok, msg, _ = run_single_kernel(kernel, n_lanes, args_ptr, global_size, (n_lanes, 1, 1), max_steps, debug, trace_len)
return ok, msg
def get_kernels_from_tinygrad(op_fn) -> tuple[list[KernelSnapshot], dict[int, int], dict[int, bytes]]:
"""Compile a tinygrad operation and extract all kernels with their buffer mappings."""
from tinygrad import Tensor
from tinygrad.runtime.support.elf import elf_loader
out = op_fn(Tensor)
sched = out.schedule()
kernels = []
buf_pool: dict[int, int] = {} # buffer id -> size
buf_data: dict[int, bytes] = {} # buffer id -> initial data from COPY
for ei in sched:
lowered = ei.lower()
if ei.ast.op.name == 'COPY':
# Handle COPY: extract source data to initialize destination buffer
if len(lowered.bufs) >= 2:
dst_buf, src_buf = lowered.bufs[0], lowered.bufs[1]
dst_id = id(dst_buf)
if dst_id not in buf_pool:
buf_pool[dst_id] = dst_buf.nbytes
# Get source data if it's from numpy/CPU
if hasattr(src_buf, 'base') and src_buf.base is not None and hasattr(src_buf.base, '_buf'):
src_data = bytes(src_buf.base._buf)
buf_data[dst_id] = src_data
elif ei.ast.op.name == 'SINK':
if lowered.prg and lowered.prg.p.lib:
lib = bytes(lowered.prg.p.lib)
_, sections, _ = elf_loader(lib)
for sec in sections:
if sec.name == '.text':
buf_idxs = []
buf_sizes = []
for b in lowered.bufs:
buf_id = id(b)
if buf_id not in buf_pool:
buf_pool[buf_id] = b.nbytes
buf_idxs.append(buf_id)
buf_sizes.append(b.nbytes)
kernels.append(KernelSnapshot(
code=bytes(sec.content),
src=lowered.prg.p.src,
global_size=tuple(lowered.prg.p.global_size),
local_size=tuple(lowered.prg.p.local_size),
buf_idxs=buf_idxs,
buf_sizes=buf_sizes
))
if not kernels: raise RuntimeError("No kernel found")
return kernels, buf_pool, buf_data
def get_kernel_from_tinygrad(op_fn) -> tuple[bytes, tuple[int, int, int], tuple[int, int, int], list]:
"""Compile a tinygrad operation and extract the last (main) kernel binary. Legacy wrapper."""
kernels, _, _ = get_kernels_from_tinygrad(op_fn)
k = kernels[-1]
return k.code, k.global_size, k.local_size, k.buf_sizes
@unittest.skipUnless(Device.DEFAULT == "AMD", "requires AMD device")
class TestTinygradKernels(unittest.TestCase):
"""Compare emulators on real tinygrad-compiled kernels."""
def _test_kernel(self, op_fn, max_steps=10000):
kernels, buf_pool, buf_data = get_kernels_from_tinygrad(op_fn)
ok, msg = compare_emulators_multi_kernel(kernels, buf_pool, max_steps=max_steps, buf_data=buf_data)
self.assertTrue(ok, msg)
# Basic ops - consolidated tests covering key instruction patterns
def test_unary_ops(self): self._test_kernel(lambda T: T([-1.0, 0.0, 1.0, 2.0]).relu().exp().log().sqrt().reciprocal())
def test_binary_ops(self): self._test_kernel(lambda T: (T([1.0, 2.0]) + T([3.0, 4.0])) * T([0.5, 0.5]) - T([1.0, 1.0]))
def test_trig(self): self._test_kernel(lambda T: T([0.1, 1.0, 3.14, -1.0]*8).sin() + T([0.1, 1.0, 3.14, -1.0]*8).cos())
def test_compare(self): self._test_kernel(lambda T: (T.empty(64) < T.empty(64)).where(T.empty(64), T.empty(64)))
def test_bitwise(self): self._test_kernel(lambda T: (T([0xF0, 0x0F, 0xFF]*11).int() & T([0x0F, 0x0F, 0x00]*11).int()) | T([1]*33).int())
def test_int_ops(self): self._test_kernel(lambda T: ((T.empty(64).int() + T.empty(64).int()) * T.empty(64).int()).float())
# Reductions
def test_reduce(self): self._test_kernel(lambda T: T.empty(64).sum() + T.empty(64).max())
def test_argmax(self): self._test_kernel(lambda T: T.empty(64).argmax())
# Matmul
def test_gemm(self): self._test_kernel(lambda T: T.empty(8, 8) @ T.empty(8, 8), max_steps=100000)
@unittest.skip("Rust emulator crashes on this kernel (assertion failure in thread.rs)")
def test_gemm_fp16(self): self._test_kernel(lambda T: T.empty(16, 16).half() @ T.empty(16, 16).half(), max_steps=100000)
# Complex ops
def test_softmax(self): self._test_kernel(lambda T: T.empty(16).softmax())
def test_layernorm(self): self._test_kernel(lambda T: T.empty(8, 8).layernorm())
# Memory patterns
def test_memory(self): self._test_kernel(lambda T: T.empty(4, 4).permute(1, 0).contiguous() + T.empty(4, 1).expand(4, 4))
# Cast ops
def test_cast(self): self._test_kernel(lambda T: T.empty(32).half().float() + T.empty(32).int().float())
# Pooling - regression for VCC wave32 mode
def test_pool2d(self):
self._test_kernel(lambda T: T.empty(1, 1, 8, 8).avg_pool2d(kernel_size=(4,4)) + T.empty(1, 1, 8, 8).max_pool2d(kernel_size=(4,4)))
# Convolution
def test_conv2d(self): self._test_kernel(lambda T: T.empty(1, 2, 8, 8).conv2d(T.empty(2, 2, 3, 3)), max_steps=50000)
# Regression tests
def test_topk(self): self._test_kernel(lambda T: T.empty(64).topk(3)[0])
def test_interpolate(self): self._test_kernel(lambda T: T.empty(1,2,16,16).relu().cast('uint8').interpolate((8,8), mode="linear"))
def test_index_int64(self):
from tinygrad import dtypes
self._test_kernel(lambda T: T.empty(4, 4)[T.arange(4).cast(dtypes.int64), :])
def test_gelu(self): self._test_kernel(lambda T: T.empty(32, 32).gelu())
def test_exp(self): self._test_kernel(lambda T: T.empty(1024).exp())
def test_cross_entropy(self):
import numpy as np
np.random.seed(0)
classes = np.random.randint(0, 10, (16,), dtype=np.int32).tolist()
x_np = np.random.randn(16, 10).astype(np.float32)
self._test_kernel(lambda T: (T(x_np.tolist()).reshape(16,10) + 0).cross_entropy((T(classes).int().reshape(16) + 0)))
def test_isinf(self): self._test_kernel(lambda T: T([float('-inf'), 0., float('inf'), 1.1]*8).isinf())
def test_sin_f64(self):
from tinygrad import dtypes
self._test_kernel(lambda T: T([2.0], dtype=dtypes.float64).sin())
def test_sin_large_f32(self):
"""Test sin with large values that trigger Payne-Hanek range reduction."""
# Values around 859240 trigger the Payne-Hanek algorithm
# This tests the integer multiply-high instructions used in range reduction
self._test_kernel(lambda T: T([859240.0, 1000000.0, 100594688.0]).sin())
def test_clip_zero_one(self):
"""Test clip(0, 1) - regression for binary_crossentropy failure."""
import numpy as np
np.random.seed(0)
x_np = np.random.uniform(-2, 2, (32, 10)).astype(np.float32).tolist()
self._test_kernel(lambda T: T(x_np).clip(0, 1))
def test_mod_int64(self):
"""Test int64 modulo, especially edge cases like 1 % -1."""
from tinygrad import dtypes
self._test_kernel(lambda T: T([1, 10, -10, 7], dtype=dtypes.int64) % T([-1, 3, 3, -3], dtype=dtypes.int64))
def test_expand_flatten_sum(self):
"""Test flatten of expanded tensor followed by sum.
Bug: flatten() of an expanded tensor produces wrong results for certain sizes.
Sizes that are multiples of 32 work (32, 48, 64), but sizes like 33, 49, 50 fail.
This breaks masked_select and nonzero operations.
"""
import numpy as np
np.random.seed(0)
x_np = np.random.uniform(-2, 2, (33,)).astype(np.float32)
self._test_kernel(lambda T: (T(x_np.tolist()) > 0.5).unsqueeze(-1).expand(33, 3).flatten().sum())
@unittest.skip("slow and broken with AMD:LLVM")
def test_nonzero(self):
"""Test nonzero operation - counts and gathers indices of non-zero elements."""
import numpy as np
np.random.seed(42)
x_np = np.random.rand(10, 5, 3).astype(np.float32)
self._test_kernel(lambda T: (T(x_np.tolist()) > 0.5).nonzero())
@unittest.skip("Precision differences in v_exp/v_log accumulate across kernels, causing memory divergence")
def test_softmax_argmax_fused(self):
"""Test fused softmax+argmax - tracks exp2 precision issue.
The fused kernel recomputes softmax inline and Python emulator's exp2 polynomial
has up to 1 ULP error vs native exp2f, causing accumulated differences.
"""
import torch
torch.manual_seed(0)
x_np = torch.rand(4, 10).numpy()
self._test_kernel(lambda T: T(x_np.tolist()).softmax(1).argmax())
if __name__ == "__main__":
unittest.main()

View file

@ -39,7 +39,6 @@ dev.synchronize()
env = os.environ.copy()
env["AMD"] = "1"
env["MOCKGPU"] = "1"
env["PYTHON_REMU"] = "1"
env["HCQDEV_WAIT_TIMEOUT_MS"] = "10000"
st = time.perf_counter()

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3
"""Roundtrip tests: generate tinygrad kernels, decode instructions, re-encode, verify match."""
import unittest, io, sys, re
from dataclasses import dataclass
from tinygrad import Device
from tinygrad.renderer.amd import detect_format
from test.amd.helpers import llvm_assemble, llvm_disasm, get_target, get_mattr
@ -44,6 +45,64 @@ def compile_and_disasm_batch(instrs: list[str], arch: str = 'rdna3') -> list[str
code = b''.join(llvm_assemble(instrs, mcpu, mattr))
return llvm_disasm(code, mcpu, mattr)[:len(instrs)]
@dataclass
class KernelSnapshot:
code: bytes
src: str
global_size: tuple[int, int, int]
local_size: tuple[int, int, int]
buf_idxs: list[int] # indices into shared buffer pool
buf_sizes: list[int] # sizes for each buffer index
def get_kernels_from_tinygrad(op_fn) -> tuple[list[KernelSnapshot], dict[int, int], dict[int, bytes]]:
"""Compile a tinygrad operation and extract all kernels with their buffer mappings."""
from tinygrad import Tensor
from tinygrad.runtime.support.elf import elf_loader
out = op_fn(Tensor)
sched = out.schedule()
kernels = []
buf_pool: dict[int, int] = {} # buffer id -> size
buf_data: dict[int, bytes] = {} # buffer id -> initial data from COPY
for ei in sched:
lowered = ei.lower()
if ei.ast.op.name == 'COPY':
# Handle COPY: extract source data to initialize destination buffer
if len(lowered.bufs) >= 2:
dst_buf, src_buf = lowered.bufs[0], lowered.bufs[1]
dst_id = id(dst_buf)
if dst_id not in buf_pool:
buf_pool[dst_id] = dst_buf.nbytes
# Get source data if it's from numpy/CPU
if hasattr(src_buf, 'base') and src_buf.base is not None and hasattr(src_buf.base, '_buf'):
src_data = bytes(src_buf.base._buf)
buf_data[dst_id] = src_data
elif ei.ast.op.name == 'SINK':
if lowered.prg and lowered.prg.p.lib:
lib = bytes(lowered.prg.p.lib)
_, sections, _ = elf_loader(lib)
for sec in sections:
if sec.name == '.text':
buf_idxs = []
buf_sizes = []
for b in lowered.bufs:
buf_id = id(b)
if buf_id not in buf_pool:
buf_pool[buf_id] = b.nbytes
buf_idxs.append(buf_id)
buf_sizes.append(b.nbytes)
kernels.append(KernelSnapshot(
code=bytes(sec.content),
src=lowered.prg.p.src,
global_size=tuple(lowered.prg.p.global_size),
local_size=tuple(lowered.prg.p.local_size),
buf_idxs=buf_idxs,
buf_sizes=buf_sizes
))
if not kernels: raise RuntimeError("No kernel found")
return kernels, buf_pool, buf_data
@unittest.skipUnless(Device.DEFAULT == "AMD", "requires AMD device")
class TestTinygradKernelRoundtrip(unittest.TestCase):
"""Test roundtrip on real tinygrad-generated kernels using get_kernels_from_tinygrad pattern."""
@ -57,7 +116,6 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
"""
arch = self.arch
from test.amd.test_compare_emulators import get_kernels_from_tinygrad
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
from tinygrad.helpers import DEV

View file

@ -4,7 +4,7 @@ Test with `pytest -n12 test/amd/`
`DEV=AMD:LLVM pytest -n12 test/amd/`
* dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1 PYTHON_REMU=1`
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1`
* generate.py -- extract assembly format + instruction pseudocode from AMD XML + PDF
* test/mockgpu/amd/pcode.py -- pseudocode to UOp transformation
* sqtt.py -- SQTT parser
@ -20,20 +20,19 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same
tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.
`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
The ops tests also pass, but they are very slow, so you should run them one at a time.
`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware
If a test is failing with `DEV=AMD PYTHON_REMU=1 MOCKGPU=1` it's because an instruction is emulated incorrectly.
If a test is failing with `DEV=AMD MOCKGPU=1` it's because an instruction is emulated incorrectly.
You can test without `MOCKGPU=1` to test on real hardware, if it works on real hardware there's a bug in the emulator.
IMPORTANT: if a test is failing in the emulator, it's an instruction bug. Use DEBUG=7, get the instructions, and debug.
Currently, only RDNA3 is well supported, but when finished, this will support RDNA3+RDNA4+CDNA in ~3000 lines.
Get line count with `cloc --by-file tinygrad/renderer/amd/*.py`

View file

@ -1,7 +1,7 @@
import ctypes, time
from dataclasses import replace
from test.mockgpu.gpu import VirtGPU
from test.mockgpu.helpers import _try_dlopen_remu
from test.mockgpu.helpers import PythonRemu
from tinygrad.helpers import getbits, to_mv, getenv, DEV
from tinygrad.runtime.support import c
@ -41,7 +41,7 @@ WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
remu = _try_dlopen_remu()
remu = PythonRemu()
def create_sdma_packets():
# TODO: clean up this, if we want to keep it
@ -212,13 +212,13 @@ class PM4Executor(AMDQueue):
scratch_size = wavesize * (16 if self.gpu.arch == "cdna" else 4) # per-thread scratch size in bytes
assert prg_sz > 0, "Invalid prg ptr (not found in mapped ranges)"
# Pass valid memory ranges, rsrc2, scratch_size, arch, and user data registers to Python emulator
if hasattr(remu, 'valid_mem_ranges'): remu.valid_mem_ranges = self.gpu.mapped_ranges
if hasattr(remu, 'rsrc2'): remu.rsrc2 = rsrc2
if hasattr(remu, 'scratch_size'): remu.scratch_size = scratch_size
if hasattr(remu, 'arch'): remu.arch = self.gpu.arch
if hasattr(remu, 'user_data'): remu.user_data = user_data
err = remu.run_asm(prg_addr, prg_sz, *gl, *lc, args_addr)
# Pass valid memory ranges, rsrc2, scratch_size, arch, and user data registers to the emulator
remu.valid_mem_ranges = self.gpu.mapped_ranges
remu.rsrc2 = rsrc2
remu.scratch_size = scratch_size
remu.arch = self.gpu.arch
remu.user_data = user_data
err = remu.run_asm(prg_addr, prg_sz, gl[0], gl[1], gl[2], lc[0], lc[1], lc[2], args_addr)
if err != 0: raise RuntimeError("remu does not support the new instruction introduced in this kernel")
def _exec_indirect_buffer(self, n):

View file

@ -233,7 +233,6 @@ VOPD_TO_VOP2 = {
ir4.VOPDOp.V_DUAL_DOT2ACC_F32_F16: ir3.VOP2Op.V_DOT2ACC_F32_F16_E32,
}
def _wave_size(arch: str) -> int: return 64 if arch.startswith("cdna") else 32
WAVE_SIZE = 32 # default wave size for RDNA (exported for test_compare_emulators)
# Special registers stored after inline constants (256-259)
PC_LO_IDX, PC_HI_IDX, SCRATCH_STRIDE_IDX = 256, 257, 259
# SGPR buffer: 0-127 = SGPRs, 128-255 = inline constants, 256-259 = special registers

View file

@ -1,5 +1,4 @@
import ctypes, ctypes.util
from tinygrad.helpers import getenv
def _try_dlopen_gpuocelot():
GPUOCELOT_PATHS = [ctypes.util.find_library("gpuocelot")] if ctypes.util.find_library("gpuocelot") is not None else []
@ -16,7 +15,7 @@ def _try_dlopen_gpuocelot():
return None
class PythonRemu:
"""Python RDNA3/RDNA4 emulator wrapper that matches the libremu.so interface."""
"""Python RDNA3/RDNA4 emulator wrapper used by mockgpu."""
valid_mem_ranges: set[tuple[int, int]] = set()
rsrc2: int = 0x19c # Default: USER_SGPR_COUNT=14, enable X and Y workgroup IDs
scratch_size: int = 0 # private_segment_fixed_size from kernel descriptor
@ -26,20 +25,3 @@ class PythonRemu:
def run_asm(self, lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int:
from test.mockgpu.amd.emu import run_asm
return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr, self.rsrc2, self.scratch_size, self.arch, self.user_data)
def _try_dlopen_remu():
# Use Python emulator only if PYTHON_REMU=1
if int(getenv("PYTHON_REMU", "1")):
return PythonRemu()
REMU_PATHS = ["extra/remu/target/release/libremu.so", "libremu.so", "/usr/local/lib/libremu.so",
"extra/remu/target/release/libremu.dylib", "libremu.dylib", "/usr/local/lib/libremu.dylib", "/opt/homebrew/lib/libremu.dylib"]
for path in REMU_PATHS:
try:
remu = ctypes.CDLL(path)
remu.run_asm.restype = ctypes.c_int32
remu.run_asm.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32,
ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_void_p]
except OSError: pass
else: return remu
print("Could not find libremu.so")
return None