mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
remove REMU from tree (#15706)
* no more compare emulators * remove remu from tree
This commit is contained in:
parent
ac027055ef
commit
16f50a40a5
22 changed files with 77 additions and 5732 deletions
5
.github/actions/setup-tinygrad/action.yml
vendored
5
.github/actions/setup-tinygrad/action.yml
vendored
|
|
@ -225,14 +225,12 @@ runs:
|
|||
if: inputs.amd == 'true' && runner.os == 'Linux'
|
||||
shell: bash
|
||||
run: |
|
||||
cargo build --release --manifest-path ./extra/remu/Cargo.toml
|
||||
sudo ln -sf ${{ github.workspace }}/extra/remu/target/release/libremu.so /usr/local/lib/libremu.so
|
||||
sudo tee --append /etc/ld.so.conf.d/rocm.conf <<'EOF'
|
||||
/opt/rocm/lib
|
||||
/opt/rocm/lib64
|
||||
EOF
|
||||
sudo ldconfig
|
||||
- name: Setup AMD comgr+remu (macOS)
|
||||
- name: Setup AMD comgr (macOS)
|
||||
if: inputs.amd == 'true' && runner.os == 'macOS'
|
||||
shell: bash
|
||||
run: |
|
||||
|
|
@ -240,7 +238,6 @@ runs:
|
|||
curl -s -H "Authorization: token $GH_TOKEN" curl -s https://api.github.com/repos/tinygrad/amdcomgr_dylib/releases/latest | \
|
||||
jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \
|
||||
sudo xargs curl -fL -o /usr/local/lib/libamd_comgr.dylib
|
||||
cargo build --release --manifest-path ./extra/remu/Cargo.toml
|
||||
|
||||
# **** gpuocelot ****
|
||||
|
||||
|
|
|
|||
4
.github/workflows/benchmark.yml
vendored
4
.github/workflows/benchmark.yml
vendored
|
|
@ -71,10 +71,6 @@ jobs:
|
|||
uv venv /tmp/tinygrad_pytest_ci
|
||||
source /tmp/tinygrad_pytest_ci/bin/activate
|
||||
uv pip install .[testing]
|
||||
- name: setup other stuff
|
||||
run: |
|
||||
mkdir -p extra/remu/target/release/
|
||||
ln -s ~/tinygrad/extra/remu/target/release/libremu.so extra/remu/target/release/libremu.so
|
||||
- name: setup staging db
|
||||
run: |
|
||||
echo "CACHEDB=/tmp/pytest-db-ci.db" >> $GITHUB_ENV
|
||||
|
|
|
|||
1
.github/workflows/test.yml
vendored
1
.github/workflows/test.yml
vendored
|
|
@ -644,7 +644,6 @@ jobs:
|
|||
timeout-minutes: 20
|
||||
env:
|
||||
DEV: AMD
|
||||
PYTHON_REMU: 1
|
||||
MOCKGPU: 1
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
|
|
|
|||
66
extra/remu/Cargo.lock
generated
66
extra/remu/Cargo.lock
generated
|
|
@ -1,66 +0,0 @@
|
|||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "crunchy"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
|
||||
|
||||
[[package]]
|
||||
name = "float-cmp"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4"
|
||||
dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "half"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"crunchy",
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libm"
|
||||
version = "0.2.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"libm",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "remu"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"float-cmp",
|
||||
"half",
|
||||
"num-traits",
|
||||
]
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
[package]
|
||||
name = "remu"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
rust-version = "1.80.0"
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib"]
|
||||
|
||||
[dependencies]
|
||||
half = { version = "2.3.1", features = ["num-traits"] }
|
||||
num-traits = "0.2.17"
|
||||
|
||||
[dev-dependencies]
|
||||
float-cmp = "0.9.0"
|
||||
|
|
@ -1,80 +0,0 @@
|
|||
## Intro
|
||||
|
||||
Remu is an RDNA3 emulator built to test correctness of RDNA3 code. It is used in [tinygrad's AMD CI](https://github.com/tinygrad/tinygrad).
|
||||
|
||||
Most of the common instructions are implemented, but some formats like IMG are not supported.
|
||||
|
||||
Remu is only for testing correctness of program output, it is not a cycle accurate simulator.
|
||||
|
||||
## Build Locally
|
||||
|
||||
Remu is written in Rust. Make sure you have [Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html).
|
||||
|
||||
To build the project, run:
|
||||
|
||||
```bash
|
||||
cargo build --release --manifest-path ./extra/remu/Cargo.toml
|
||||
```
|
||||
|
||||
This will produce a binary in the `extra/remu/target/release` directory.
|
||||
|
||||
## Usage with tinygrad
|
||||
|
||||
The latest binaries are released in https://github.com/Qazalin/remu/releases. Alternatively, you can [build locally](#build-locally).
|
||||
|
||||
Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `DEV=AMD:LLVM` (default) if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).
|
||||
|
||||
`PYTHONPATH="." MOCKGPU=1 DEV=AMD python test/test_tiny.py TestTiny.test_plus` runs an emulated RDNA3 kernel with Remu.
|
||||
|
||||
Add `DEBUG=6` to see Remu's logs.
|
||||
|
||||
### DEBUG output
|
||||
|
||||
Remu runs each thread one at a time in a nested for loop, see lib.rs. The DEBUG output prints information about the current thread.
|
||||
|
||||
The DEBUG output has 3 sections:
|
||||
|
||||
```
|
||||
<------------ 1 ----------> <--- 2 ---> <--------------------------------------- 3 ------------------------------------------>
|
||||
[0 0 0 ] [0 0 0 ] 0 F4080100 SMEM { op: 2, sdata: 4, sbase: 0, offset: 0, soffset: 124, glc: false, dlc: false }
|
||||
```
|
||||
|
||||
#### Section 1: Grid info
|
||||
|
||||
`[gid.x, gid.y, gid.z], [lid.x, lid.y, lid.z]` of the current thread.
|
||||
|
||||
#### Section 2: Wave info
|
||||
|
||||
`<lane> <instruction hex>`
|
||||
|
||||
RDNA3 divides threads into chunks of 32. Each thread is assigned to a "lane" from 0-31.
|
||||
|
||||
In Remu, even though all threads run one at a time, each 32 thread chunk (a wave) shares state like SGPR, VGPR, LDS, EXEC mask, etc.
|
||||
Remu can simulate up to one wave sync instruction.
|
||||
For more details, see work_group.rs.
|
||||
|
||||
Section 2 can have a green or gray color.
|
||||
|
||||
Green = The thread is actively executing the instruction.
|
||||
|
||||
Gray = The thread has been "turned off" by the EXEC mask, it skips execution of some instructions. (refer to "EXECute Mask" on [page 23](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf#page=23) of ISA docs for more details.)
|
||||
|
||||
To see the colors in action, try running `DEBUG=6 PYTHONPATH="." MOCKGPU=1 DEV=AMD python test/test_ops.py TestOps.test_arange_big`. See how only lane 0 writes to global memory:
|
||||
```
|
||||
[255 0 0 ] [0 0 0 ] 0 DC6A0000 FLAT { op: 26, offset: 0, dlc: false, glc: false, slc: false, seg: 2, addr: 8, data: 0, saddr: 0, sve: false, vdst: 0 }
|
||||
[255 0 0 ] [1 0 0 ] 1 DC6A0000
|
||||
[255 0 0 ] [2 0 0 ] 2 DC6A0000
|
||||
[255 0 0 ] [3 0 0 ] 3 DC6A0000
|
||||
[255 0 0 ] [3 0 0 ] 4 DC6A0000
|
||||
```
|
||||
|
||||
#### Section 3: Decoded Instruction
|
||||
|
||||
This prints the instruction type and all the parsed bitfields.
|
||||
|
||||
Remu output vs llvm-objdump:
|
||||
|
||||
```
|
||||
s_load_b64 s[0:1], s[0:1], 0x10 // 00000000160C: F4040000 F8000010
|
||||
SMEM { op: 1, sdata: 0, sbase: 0, offset: 16, soffset: 124, glc: false, dlc: false }
|
||||
```
|
||||
|
|
@ -1 +0,0 @@
|
|||
max_width = 150
|
||||
|
|
@ -1,162 +0,0 @@
|
|||
use half::f16;
|
||||
use num_traits::{float::FloatCore, PrimInt, Unsigned, clamp};
|
||||
|
||||
pub fn bits<T>(word: T, hi: usize, lo: usize) -> T where T: PrimInt + Unsigned {
|
||||
assert!(hi >= lo);
|
||||
let width = hi - lo + 1;
|
||||
(word >> lo) & ((T::one() << width) - T::one())
|
||||
}
|
||||
|
||||
pub fn nth(val: u32, pos: usize) -> u32 {
|
||||
(val >> (31 - pos as u32)) & 1
|
||||
}
|
||||
pub fn f16_lo(val: u32) -> f16 {
|
||||
f16::from_bits((val & 0xffff) as u16)
|
||||
}
|
||||
pub fn f16_hi(val: u32) -> f16 {
|
||||
f16::from_bits(((val >> 16) & 0xffff) as u16)
|
||||
}
|
||||
|
||||
pub fn sign_ext(num: u64, bits: usize) -> i64 {
|
||||
let mut value = num;
|
||||
let is_negative = (value >> (bits - 1)) & 1 != 0;
|
||||
if is_negative {
|
||||
value |= !0 << bits;
|
||||
}
|
||||
value as i64
|
||||
}
|
||||
|
||||
pub trait IEEEClass<T> {
|
||||
fn exponent(&self) -> T;
|
||||
}
|
||||
impl IEEEClass<u32> for f32 {
|
||||
fn exponent(&self) -> u32 {
|
||||
(self.to_bits() & 0b01111111100000000000000000000000) >> 23
|
||||
}
|
||||
}
|
||||
impl IEEEClass<u16> for f16 {
|
||||
fn exponent(&self) -> u16 {
|
||||
(self.to_bits() & 0b0111110000000000) >> 10
|
||||
}
|
||||
}
|
||||
impl IEEEClass<u64> for f64 {
|
||||
fn exponent(&self) -> u64 {
|
||||
(self.to_bits() & 0b0111111111110000000000000000000000000000000000000000000000000000) >> 52
|
||||
}
|
||||
}
|
||||
|
||||
pub trait VOPModifier<T> {
|
||||
fn negate(&self, pos: usize, modifier: usize) -> T;
|
||||
fn absolute(&self, pos: usize, modifier: usize) -> T;
|
||||
fn clmp(&self, cm: bool) -> T;
|
||||
}
|
||||
impl<T> VOPModifier<T> for T
|
||||
where
|
||||
T: FloatCore,
|
||||
{
|
||||
fn negate(&self, pos: usize, modifier: usize) -> T {
|
||||
match (modifier >> pos) & 1 {
|
||||
1 => -*self,
|
||||
_ => *self,
|
||||
}
|
||||
}
|
||||
fn absolute(&self, pos: usize, modifier: usize) -> T {
|
||||
match (modifier >> pos) & 1 {
|
||||
1 => self.abs(),
|
||||
_ => *self,
|
||||
}
|
||||
}
|
||||
fn clmp(&self, cm:bool) -> T {
|
||||
if !cm { return *self }
|
||||
let r = clamp(*self, T::zero(), T::one());
|
||||
if r == T::zero() { T::zero() } else { r }
|
||||
}
|
||||
}
|
||||
|
||||
pub fn extract_mantissa(x: f64) -> f64 {
|
||||
if x.is_infinite() || x.is_nan() {
|
||||
return x;
|
||||
}
|
||||
let bits = x.to_bits();
|
||||
let mantissa_mask: u64 = 0x000FFFFFFFFFFFFF;
|
||||
let bias: u64 = 1023;
|
||||
let normalized_mantissa_bits = (bits & mantissa_mask) | ((bias - 1) << 52);
|
||||
return f64::from_bits(normalized_mantissa_bits);
|
||||
}
|
||||
pub fn ldexp(x: f64, exp: i32) -> f64 {
|
||||
x * 2f64.powi(exp)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
#[test]
|
||||
fn test_extract_mantissa() {
|
||||
assert_eq!(extract_mantissa(2.0f64), 0.5);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_normal_exponent() {
|
||||
assert_eq!(2.5f32.exponent(), 128);
|
||||
assert_eq!(1.17549435e-38f32.exponent(), 1);
|
||||
assert_eq!(f32::INFINITY.exponent(), 255);
|
||||
assert_eq!(f32::NEG_INFINITY.exponent(), 255);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_denormal_exponent() {
|
||||
assert_eq!(1.0e-40f32.exponent(), 0);
|
||||
assert_eq!(1.0e-42f32.exponent(), 0);
|
||||
assert_eq!(1.0e-44f32.exponent(), 0);
|
||||
assert_eq!((1.17549435e-38f32 / 2.0).exponent(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_normal_exponent_f16() {
|
||||
assert_eq!(f16::from_f32(3.14f32).exponent(), 16);
|
||||
assert_eq!(f16::NEG_INFINITY.exponent(), 31);
|
||||
assert_eq!(f16::INFINITY.exponent(), 31);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_neg() {
|
||||
assert_eq!(0.3_f32.negate(0, 0b001), -0.3_f32);
|
||||
assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32);
|
||||
assert_eq!(0.3_f32.negate(2, 0b100), -0.3_f32);
|
||||
assert_eq!(0.3_f32.negate(0, 0b110), 0.3_f32);
|
||||
assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32);
|
||||
assert_eq!(0.0_f32.negate(0, 0b001).to_bits(), (-0.0f32).to_bits());
|
||||
assert_eq!((-0.0_f32).negate(0, 0b001).to_bits(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_sign_ext() {
|
||||
assert_eq!(sign_ext(0b000000000000000101000, 21), 40);
|
||||
assert_eq!(sign_ext(0b111111111111111011000, 21), -40);
|
||||
assert_eq!(sign_ext(0b000000000000000000000, 21), 0);
|
||||
assert_eq!(sign_ext(0b111111111111111111111, 21), -1);
|
||||
assert_eq!(sign_ext(0b111000000000000000000, 21), -262144);
|
||||
assert_eq!(sign_ext(0b000111111111111111111, 21), 262143);
|
||||
assert_eq!(sign_ext(7608, 13), -584);
|
||||
}
|
||||
}
|
||||
|
||||
use std::sync::LazyLock;
|
||||
pub static DEBUG: LazyLock<bool> = LazyLock::new(|| std::env::var("DEBUG").map(|v| v.parse::<usize>().unwrap_or(0) >= 6).unwrap_or(false));
|
||||
|
||||
pub fn colored(st:&str, color:&str) -> String {
|
||||
let ansi_code = match color {
|
||||
"green" => format!("\x1b[{};2;39;176;139m", 38),
|
||||
"gray" => format!("\x1b[{};2;169;169;169m", 38),
|
||||
_ => format!("\x1b[{};2;255;255;255m", 38),
|
||||
};
|
||||
format!("{}{}{}", ansi_code, st, "\x1b[0m")
|
||||
}
|
||||
|
||||
#[macro_export]
|
||||
macro_rules! todo_instr {
|
||||
($x:expr) => {{
|
||||
println!("{:08X}", $x);
|
||||
Err(1)
|
||||
}};
|
||||
}
|
||||
|
|
@ -1,77 +0,0 @@
|
|||
use crate::state::StateSnapshot;
|
||||
use crate::work_group::{WaveContext, WorkGroup};
|
||||
use std::os::raw::c_char;
|
||||
use std::slice;
|
||||
mod helpers;
|
||||
mod rdna3;
|
||||
mod state;
|
||||
mod thread;
|
||||
mod work_group;
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn run_asm(lib: *const c_char, lib_sz: u32, gx: u32, gy: u32, gz: u32, lx: u32, ly: u32, lz: u32, args_ptr: *const u64) -> i32 {
|
||||
if lib.is_null() || (lib_sz % 4) != 0 {
|
||||
panic!("Pointer is null or length is not properly aligned to 4 bytes");
|
||||
}
|
||||
let kernel = unsafe { slice::from_raw_parts(lib as *const u32, (lib_sz / 4) as usize).to_vec() };
|
||||
let dispatch_dim = match (gy != 1, gz != 1) {
|
||||
(true, true) => 3,
|
||||
(true, false) => 2,
|
||||
_ => 1,
|
||||
};
|
||||
for gx in 0..gx {
|
||||
for gy in 0..gy {
|
||||
for gz in 0..gz {
|
||||
let mut wg = WorkGroup::new(dispatch_dim, [gx, gy, gz], [lx, ly, lz], &kernel, args_ptr);
|
||||
if let Err(err) = wg.exec_waves() {
|
||||
return err;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
0
|
||||
}
|
||||
|
||||
// FFI functions for single-stepping comparison tests
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn wave_create(lib: *const c_char, lib_sz: u32, n_lanes: u32) -> *mut WaveContext {
|
||||
if lib.is_null() || (lib_sz % 4) != 0 { return std::ptr::null_mut(); }
|
||||
let kernel = unsafe { slice::from_raw_parts(lib as *const u32, (lib_sz / 4) as usize).to_vec() };
|
||||
Box::into_raw(Box::new(WaveContext::new(kernel, n_lanes as usize)))
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn wave_step(ctx: *mut WaveContext) -> i32 {
|
||||
if ctx.is_null() { return -99; }
|
||||
unsafe { (*ctx).step() }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn wave_get_snapshot(ctx: *const WaveContext, out: *mut StateSnapshot) {
|
||||
if ctx.is_null() || out.is_null() { return; }
|
||||
unsafe { *out = (*ctx).get_snapshot(); }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn wave_set_sgpr(ctx: *mut WaveContext, idx: u32, val: u32) {
|
||||
if ctx.is_null() || idx >= 128 { return; }
|
||||
unsafe { (*ctx).scalar_reg[idx as usize] = val; }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn wave_set_vgpr(ctx: *mut WaveContext, lane: u32, idx: u32, val: u32) {
|
||||
if ctx.is_null() || lane >= 32 || idx >= 256 { return; }
|
||||
unsafe { (*ctx).vec_reg.get_lane_mut(lane as usize)[idx as usize] = val; }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn wave_init_lds(ctx: *mut WaveContext, size: u32) {
|
||||
if ctx.is_null() { return; }
|
||||
unsafe { (*ctx).lds.data.resize(size as usize, 0); }
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn wave_free(ctx: *mut WaveContext) {
|
||||
if !ctx.is_null() { unsafe { drop(Box::from_raw(ctx)); } }
|
||||
}
|
||||
|
|
@ -1,223 +0,0 @@
|
|||
use crate::helpers::{bits, sign_ext};
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
pub enum Instruction {
|
||||
SOP2 { op: u8, ssrc0: u8, ssrc1: u8, sdst: u8 },
|
||||
SOP1 { op: u8, ssrc0: u8, sdst: u8 },
|
||||
SOPK { op: u8, simm16: i16, sdst: u8 },
|
||||
SOPP { op: u8, simm16: i16 },
|
||||
SOPC { op: u8, ssrc0: u8, ssrc1: u8 },
|
||||
|
||||
SMEM { op: u8, sdata: u8, sbase: u8, offset: i32, soffset: u8, glc: bool, dlc: bool },
|
||||
|
||||
VOP1 { op: u8, vdst: u8, src: u16 },
|
||||
VOP2 { op: u8, vdst: u8, vsrc: u8, src: u16 },
|
||||
VOPC { op: u8, vsrc: u8, src: u16 },
|
||||
VOP3 { op: u32, opsel: u8, cm: bool, abs: u8, vdst: u8, neg: u8, omod: u8, src2: u16, src1: u16, src0: u16 },
|
||||
VOP3SD { op: u32, cm: bool, sdst: u8, vdst: u8, neg: u8, omod: u8, src2: u16, src1: u16, src0: u16 },
|
||||
VOP3P { op: u8, vdst: u8, neg_hi: u8, opsel: u8, opsel_hi: u8, opsel_hi2: bool, cm: bool, src2: u16, src1: u16, src0: u16, neg: u8 },
|
||||
VOPD { opx: u8, opy: u8, vdstx: u8, vdsty: u8, vsrcx1: u8, vsrcy1: u8, srcx0: u16, srcy0: u16 },
|
||||
|
||||
DS { op: u8, gds: bool, offset1: u8, offset0: u8, vdst: u8, data1: u8, data0: u8, addr: u8 },
|
||||
|
||||
FLAT { op: u8, offset: u16, dlc: bool, glc: bool, slc: bool, seg: u8, addr: u8, data: u8, saddr: u8, sve: bool, vdst: u8 }
|
||||
}
|
||||
|
||||
const VOP3SD_OPS: [u32; 7] = [764, 765, 766, 767, 768, 769, 770];
|
||||
|
||||
pub fn decode(word:u32, word1:Option<&u32>) -> Instruction {
|
||||
match bits(word, 31, 30) {
|
||||
0b11 => {
|
||||
let word = (*word1.unwrap() as u64) << 32 | (word as u64);
|
||||
match bits(word, 29, 26) {
|
||||
0b1101 => {
|
||||
let sbase = (bits(word, 5, 0) as u8) << 1;
|
||||
let sdata = bits(word, 12, 6) as u8;
|
||||
let dlc = bits(word, 13, 13) != 0;
|
||||
let glc = bits(word, 14, 14) != 0;
|
||||
let op = bits(word, 25, 18) as u8;
|
||||
let offset = sign_ext(bits(word, 52, 32), 21) as i32;
|
||||
let soffset = bits(word, 63, 57) as u8;
|
||||
Instruction::SMEM { sbase, sdata, dlc, glc, op, offset, soffset }
|
||||
}
|
||||
0b0101 => {
|
||||
let op = bits(word, 25, 16) as u32;
|
||||
let vdst = bits(word, 7, 0) as u8;
|
||||
let cm = bits(word, 15, 15) != 0;
|
||||
let src0 = bits(word, 40, 32) as u16;
|
||||
let src1 = bits(word, 49, 41) as u16;
|
||||
let src2 = bits(word, 58, 50) as u16;
|
||||
let omod = bits(word, 60, 59) as u8;
|
||||
let neg = bits(word, 63, 61) as u8;
|
||||
if VOP3SD_OPS.contains(&op) {
|
||||
let sdst = bits(word, 14, 8) as u8;
|
||||
Instruction::VOP3SD { op, vdst, sdst, cm, src0, src1, src2, omod, neg }
|
||||
} else {
|
||||
let abs = bits(word, 10, 8) as u8;
|
||||
let opsel = bits(word, 14, 11) as u8;
|
||||
Instruction::VOP3 { opsel, cm, abs, vdst, neg, omod, src2, src1, src0, op }
|
||||
}
|
||||
}
|
||||
0b0011 => {
|
||||
let op = bits(word, 22, 16) as u8;
|
||||
let vdst = bits(word, 7, 0) as u8;
|
||||
let neg_hi = bits(word, 10, 8) as u8;
|
||||
let opsel = bits(word, 13, 11) as u8;
|
||||
let opsel_hi2 = bits(word, 14, 14) != 0;
|
||||
let cm = bits(word, 15, 15) != 0;
|
||||
let src0 = bits(word, 40, 32) as u16;
|
||||
let src1 = bits(word, 49, 41) as u16;
|
||||
let src2 = bits(word, 58, 50) as u16;
|
||||
let opsel_hi = bits(word, 60, 59) as u8;
|
||||
let neg = bits(word, 63, 61) as u8;
|
||||
Instruction::VOP3P { op, vdst, neg_hi, opsel, opsel_hi, opsel_hi2, cm, src0, src1, src2, neg }
|
||||
}
|
||||
0b0110 => {
|
||||
let offset0 = bits(word, 7, 0) as u8;
|
||||
let offset1 = bits(word, 15, 8) as u8;
|
||||
let gds = bits(word, 17, 17) != 0;
|
||||
let op = bits(word, 25, 18) as u8;
|
||||
let addr = bits(word, 39, 32) as u8;
|
||||
let data0 = bits(word, 47, 40) as u8;
|
||||
let data1 = bits(word, 55, 48) as u8;
|
||||
let vdst = bits(word, 63, 56) as u8;
|
||||
Instruction::DS { op, gds, offset1, offset0, vdst, data1, data0, addr }
|
||||
}
|
||||
0b0111 => {
|
||||
let offset = bits(word, 12, 0) as u16;
|
||||
let dlc = bits(word, 13, 13) != 0;
|
||||
let glc = bits(word, 14, 14) != 0;
|
||||
let slc = bits(word, 15, 15) != 0;
|
||||
let seg = bits(word, 17, 16) as u8;
|
||||
let op = bits(word, 24, 18) as u8;
|
||||
let addr = bits(word, 39, 32) as u8;
|
||||
let data = bits(word, 47, 40) as u8;
|
||||
let saddr = bits(word, 54, 48) as u8;
|
||||
let sve = bits(word, 55, 55) != 0;
|
||||
let vdst = bits(word, 63, 56) as u8;
|
||||
Instruction::FLAT { offset, dlc, glc, slc, seg, op, addr, data, saddr, sve, vdst }
|
||||
},
|
||||
0b0010 => {
|
||||
let srcx0 = bits(word, 8, 0) as u16;
|
||||
let vsrcx1 = bits(word, 16, 9) as u8;
|
||||
let opy = bits(word, 21, 17) as u8;
|
||||
let opx = bits(word, 25, 22) as u8;
|
||||
let srcy0 = bits(word, 40, 32) as u16;
|
||||
let vsrcy1 = bits(word, 48, 41) as u8;
|
||||
let vdsty = bits(word, 55, 49) as u8;
|
||||
let vdstx = bits(word, 63, 56) as u8;
|
||||
Instruction::VOPD { opx, opy, vdstx, vdsty, vsrcx1, vsrcy1, srcx0, srcy0 }
|
||||
}
|
||||
_ => todo!(),
|
||||
}
|
||||
}
|
||||
0b10 => {
|
||||
let ssrc0 = bits(word, 7, 0) as u8;
|
||||
let ssrc1 = bits(word, 15, 8) as u8;
|
||||
let simm16 = word as i16;
|
||||
let sdst = bits(word, 22, 16) as u8;
|
||||
match bits(word, 29, 23) {
|
||||
0b1111101 => Instruction::SOP1 { ssrc0, sdst, op: bits(word, 15, 8) as u8 },
|
||||
0b1111110 => Instruction::SOPC { ssrc0, ssrc1, op: bits(word, 22, 16) as u8 },
|
||||
0b1111111 => Instruction::SOPP { simm16, op: bits(word, 22, 16) as u8 },
|
||||
_ => {
|
||||
match bits(word, 29, 28) {
|
||||
0b11 => Instruction::SOPK { simm16, sdst, op: bits(word, 27, 23) as u8 },
|
||||
_ => Instruction::SOP2 { ssrc0, ssrc1, sdst, op: bits(word, 29, 23) as u8 }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
let vdst = bits(word, 24, 17) as u8;
|
||||
let src = bits(word, 8, 0) as u16;
|
||||
let vsrc = bits(word, 16, 9) as u8;
|
||||
match bits(word, 30, 25) {
|
||||
0b111110 => Instruction::VOPC { vsrc, src, op: bits(word, 24, 17) as u8 },
|
||||
0b111111 => Instruction::VOP1 { vdst, src, op: vsrc },
|
||||
_ => Instruction::VOP2 { vdst, vsrc, src, op: bits(word, 30, 25) as u8 },
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test_rdna3 {
|
||||
use super::*;
|
||||
|
||||
use std::process::{Stdio, Command};
|
||||
use std::io::{Result, Write};
|
||||
|
||||
const LLVM_ARGS: &[&str; 3] = &["--arch=amdgcn", "--mcpu=gfx1100", "--triple=amdgcn-amd-amdhsa"];
|
||||
const OFFSET_PRG: usize = 16;
|
||||
const NULL: u8 = 124;
|
||||
|
||||
fn llvm_assemble(asm: &str) -> Result<Vec<u8>> {
|
||||
let mut proc = Command::new("llvm-mc").args(LLVM_ARGS).args(["-filetype=obj", "-o", "-"]).stdin(Stdio::piped()).stdout(Stdio::piped()).spawn()?;
|
||||
proc.stdin.as_mut().unwrap().write_all(asm.as_bytes())?;
|
||||
let out = proc.wait_with_output()?;
|
||||
match out.status.success() {
|
||||
true => Ok(out.stdout),
|
||||
false => Err(std::io::Error::new(std::io::ErrorKind::Other, "llvm-mc err")),
|
||||
}
|
||||
}
|
||||
|
||||
fn llvm_disassemble(code: &Vec<u8>) -> Result<String> {
|
||||
let mut proc = Command::new("llvm-objdump").args(LLVM_ARGS).args(["--disassemble", "-"]).stdin(Stdio::piped()).stdout(Stdio::piped()).spawn()?;
|
||||
proc.stdin.as_mut().unwrap().write_all(code)?;
|
||||
let out = proc.wait_with_output()?;
|
||||
match out.status.success() {
|
||||
true => Ok(String::from_utf8(out.stdout).unwrap()),
|
||||
false => Err(std::io::Error::new(std::io::ErrorKind::Other, "llvm-objdump err")),
|
||||
}
|
||||
}
|
||||
|
||||
fn test_decode(asm: &str) -> Instruction {
|
||||
let lib = llvm_assemble(asm).unwrap();
|
||||
println!("{}", llvm_disassemble(&lib).unwrap());
|
||||
let stream: Vec<u32> = lib.chunks_exact(4).map(|chunk| u32::from_le_bytes(chunk.try_into().unwrap())).skip(OFFSET_PRG).collect();
|
||||
decode(stream[0], stream.get(1))
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_smem() {
|
||||
assert_eq!(test_decode("s_load_b128 s[4:7], s[0:1], null"), Instruction::SMEM { op: 2, sdata: 4, sbase: 0, offset: 0, soffset: NULL, glc: false, dlc: false });
|
||||
assert_eq!(test_decode("s_load_b32 s10, s[0:1], 0xc"), Instruction::SMEM { op: 0, sdata: 10, sbase: 0, offset: 0xc, soffset: NULL, glc: false, dlc: false });
|
||||
assert_eq!(test_decode("s_load_b32 s0, s[4:5], s6"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: 0, soffset: 6, glc: false, dlc: false });
|
||||
assert_eq!(test_decode("s_load_b32 s0, s[4:5], glc dlc"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: 0, soffset: NULL, glc: true, dlc: true });
|
||||
assert_eq!(test_decode("s_load_b32 s0, s[4:5], glc"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: 0, soffset: NULL, glc: true, dlc: false });
|
||||
assert_eq!(test_decode("s_load_b32 s0, s[4:5], -20"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: -20, soffset: NULL, glc: false, dlc: false });
|
||||
assert_eq!(test_decode("s_load_b32 s0, s[4:5], -1048576"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: -1048576, soffset: NULL, glc: false, dlc: false });
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_salu() {
|
||||
assert_eq!(test_decode("s_add_u32 s1 s2 s3"), Instruction::SOP2 { op: 0, ssrc0: 2, ssrc1: 3, sdst: 1 });
|
||||
assert_eq!(test_decode("s_add_u32 vcc_hi exec_lo vcc_lo"), Instruction::SOP2 { op: 0, ssrc0: 126, ssrc1: 106, sdst: 107 });
|
||||
assert_eq!(test_decode("s_mov_b32 s1 -0.5"), Instruction::SOP1 { op: 0, ssrc0: 241, sdst: 1 });
|
||||
assert_eq!(test_decode("s_cmpk_eq_i32 s0 -30"), Instruction::SOPK { op: 3, sdst: 0, simm16: -30 });
|
||||
assert_eq!(test_decode("s_cmpk_eq_u32 s0 65535"), Instruction::SOPK { op: 9, sdst: 0, simm16: -1 });
|
||||
assert_eq!(test_decode("s_cmp_ge_i32 s1 s2"), Instruction::SOPC { op: 3, ssrc0: 1, ssrc1: 2 });
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_valu_e32() {
|
||||
assert_eq!(test_decode("v_mov_b32 v0, v0"), Instruction::VOP1 { op: 1, vdst: 0, src: 256 });
|
||||
assert_eq!(test_decode("v_mov_b32 v0, s0"), Instruction::VOP1 { op: 1, vdst: 0, src: 0 });
|
||||
assert_eq!(test_decode("v_cmp_t_f32 v1, v0"), Instruction::VOPC { op: 31, vsrc: 0, src: 257 });
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_valu_e64() {
|
||||
assert_eq!(test_decode("v_log_f32_e64 v2, |v0|"), Instruction::VOP3 { op: 423, vdst: 2, src0: 256, src1: 0, src2: 0, abs: 0b001, neg: 0, opsel: 0, omod: 0, cm: false });
|
||||
assert_eq!(test_decode("v_div_scale_f32 v2, s1, v0, v1, v2"), Instruction::VOP3SD { op: 764, cm: false, vdst: 2, sdst: 1, src0: 256, src1: 257, src2: 258, omod: 0, neg: 0 });
|
||||
assert_eq!(test_decode("v_pk_add_i16 v1, v0, v2"), Instruction::VOP3P { op: 2, vdst: 1, neg_hi: 0, opsel: 0, opsel_hi: 3, opsel_hi2: true, cm: false, src2: 0, src1: 258, src0: 256, neg: 0 });
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_decode_ds() {
|
||||
assert_eq!(test_decode("ds_add_u32 v2, v4 offset:16"), Instruction::DS { op: 0, gds: false, offset1: 0, offset0: 0x10, vdst: 0, data1: 0, data0: 4, addr: 2 });
|
||||
assert_eq!(test_decode("ds_store_b32 v0, v1, offset: 0x04 gds"), Instruction::DS { op: 13, gds: true, offset1: 0, offset0: 0x04, vdst: 0, data1: 0, data0: 1, addr: 0 });
|
||||
assert_eq!(test_decode("ds_load_u8 v1, v0 offset:16"), Instruction::DS { op: 58, gds: false, offset1: 0, offset0: 16, vdst: 1, data1: 0, data0: 0, addr: 0 });
|
||||
}
|
||||
}
|
||||
|
|
@ -1,272 +0,0 @@
|
|||
use std::ops::{Index, IndexMut};
|
||||
|
||||
pub trait Register {
|
||||
fn read64(&self, idx: usize) -> u64;
|
||||
fn write64(&mut self, idx: usize, addr: u64);
|
||||
}
|
||||
impl<T> Register for T where T: Index<usize, Output = u32> + IndexMut<usize> {
|
||||
fn read64(&self, idx: usize) -> u64 {
|
||||
let lsb = self[idx] as u64;
|
||||
let msb = self[idx + 1] as u64;
|
||||
(msb << 32) | lsb
|
||||
}
|
||||
|
||||
fn write64(&mut self, idx: usize, value: u64) {
|
||||
self[idx] = (value & 0xffffffff) as u32;
|
||||
self[idx + 1] = ((value & (0xffffffff << 32)) >> 32) as u32;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct VGPR {
|
||||
values: [[u32; 256]; 32],
|
||||
pub default_lane: Option<usize>,
|
||||
}
|
||||
impl Index<usize> for VGPR {
|
||||
type Output = u32;
|
||||
fn index(&self, index: usize) -> &Self::Output {
|
||||
&self.values[self.default_lane.unwrap()][index]
|
||||
}
|
||||
}
|
||||
impl IndexMut<usize> for VGPR {
|
||||
fn index_mut(&mut self, index: usize) -> &mut Self::Output {
|
||||
&mut self.values[self.default_lane.unwrap()][index]
|
||||
}
|
||||
}
|
||||
impl VGPR {
|
||||
pub fn new() -> Self {
|
||||
VGPR {
|
||||
values: [[0; 256]; 32],
|
||||
default_lane: None,
|
||||
}
|
||||
}
|
||||
pub fn get_lane(&self, lane: usize) -> [u32; 256] {
|
||||
*self.values.get(lane).unwrap()
|
||||
}
|
||||
pub fn get_lane_mut(&mut self, lane: usize) -> &mut [u32; 256] {
|
||||
self.values.get_mut(lane).unwrap()
|
||||
}
|
||||
}
|
||||
|
||||
pub trait Value {
|
||||
fn mut_hi16(&mut self, val: u16);
|
||||
fn mut_lo16(&mut self, val: u16);
|
||||
}
|
||||
impl Value for u32 {
|
||||
fn mut_hi16(&mut self, val: u16) {
|
||||
*self = ((val as u32) << 16) | (*self as u16 as u32);
|
||||
}
|
||||
fn mut_lo16(&mut self, val: u16) {
|
||||
*self = ((((*self & (0xffff << 16)) >> 16) as u32) << 16) | val as u32;
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy)]
|
||||
pub struct WaveValue {
|
||||
pub value: u32,
|
||||
pub warp_size: usize,
|
||||
pub default_lane: Option<usize>,
|
||||
pub mutations: Option<[bool; 32]>,
|
||||
}
|
||||
impl WaveValue {
|
||||
pub fn new(value: u32, warp_size: usize) -> Self {
|
||||
Self {
|
||||
value,
|
||||
warp_size,
|
||||
default_lane: None,
|
||||
mutations: None,
|
||||
}
|
||||
}
|
||||
pub fn read(&self) -> bool {
|
||||
(self.value >> self.default_lane.unwrap()) & 1 == 1
|
||||
}
|
||||
pub fn set_lane(&mut self, value: bool) {
|
||||
if self.mutations.is_none() {
|
||||
self.mutations = Some([false; 32])
|
||||
}
|
||||
self.mutations.as_mut().unwrap()[self.default_lane.unwrap()] = value;
|
||||
}
|
||||
pub fn apply_muts(&mut self) {
|
||||
self.value = 0;
|
||||
for lane in 0..self.warp_size {
|
||||
if self.mutations.unwrap()[lane] {
|
||||
self.value |= 1 << lane;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// C-compatible state snapshot for FFI - used for comparing emulator states
|
||||
#[repr(C)]
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct StateSnapshot {
|
||||
pub pc: u32,
|
||||
pub scc: u32,
|
||||
pub vcc: u32,
|
||||
pub exec_mask: u32,
|
||||
pub sgpr: [u32; 128],
|
||||
pub vgpr: [[u32; 256]; 32],
|
||||
}
|
||||
|
||||
impl StateSnapshot {
|
||||
pub fn new() -> Self {
|
||||
Self { pc: 0, scc: 0, vcc: 0, exec_mask: 0, sgpr: [0; 128], vgpr: [[0; 256]; 32] }
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct VecDataStore {
|
||||
pub data: Vec<u8>,
|
||||
}
|
||||
|
||||
impl VecDataStore {
|
||||
pub fn new() -> Self {
|
||||
Self { data: Vec::new() }
|
||||
}
|
||||
pub fn write(&mut self, addr: usize, val: u32) {
|
||||
if addr + 4 >= self.data.len() {
|
||||
self.data.resize(self.data.len() + addr + 5, 0);
|
||||
}
|
||||
self.data[addr..addr + 4].iter_mut().enumerate().for_each(|(i, x)| {
|
||||
*x = val.to_le_bytes()[i];
|
||||
});
|
||||
}
|
||||
pub fn write64(&mut self, addr: usize, val: u64) {
|
||||
self.write(addr, (val & 0xffffffff) as u32);
|
||||
self.write(addr + 4, ((val & (0xffffffff << 32)) >> 32) as u32);
|
||||
}
|
||||
pub fn read(&self, addr: usize) -> u32 {
|
||||
let mut bytes: [u8; 4] = [0; 4];
|
||||
bytes.copy_from_slice(&self.data[addr + 0..addr + 4]);
|
||||
u32::from_le_bytes(bytes)
|
||||
}
|
||||
pub fn read64(&mut self, addr: usize) -> u64 {
|
||||
let lsb = self.read(addr);
|
||||
let msb = self.read(addr + 4);
|
||||
((msb as u64) << 32) | lsb as u64
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test_state {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_wave_value() {
|
||||
let mut val = WaveValue::new(0b11000000000000011111111111101110, 32);
|
||||
val.default_lane = Some(0);
|
||||
assert!(!val.read());
|
||||
val.default_lane = Some(31);
|
||||
assert!(val.read());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wave_value_small() {
|
||||
let mut val = WaveValue::new(0, 1);
|
||||
val.default_lane = Some(0);
|
||||
assert!(!val.read());
|
||||
assert_eq!(val.value, 0);
|
||||
val.set_lane(true);
|
||||
val.apply_muts();
|
||||
assert!(val.read());
|
||||
assert_eq!(val.value, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wave_value_small_alt() {
|
||||
let mut val = WaveValue::new(0, 2);
|
||||
val.default_lane = Some(0);
|
||||
assert!(!val.read());
|
||||
assert_eq!(val.value, 0);
|
||||
val.set_lane(true);
|
||||
val.apply_muts();
|
||||
assert!(val.read());
|
||||
assert_eq!(val.value, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wave_value_exec() {
|
||||
let warp_size = 32;
|
||||
let val = WaveValue::new(u32::MAX, warp_size);
|
||||
assert_eq!(val.value, u32::MAX);
|
||||
let warp_size = 3;
|
||||
let val = WaveValue::new((1 << warp_size) - 1, warp_size);
|
||||
assert_eq!(val.value, 7)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wave_value_toggle_one() {
|
||||
let warp_size = 2;
|
||||
let mut val = WaveValue::new(0b11, warp_size);
|
||||
// 0
|
||||
val.default_lane = Some(0);
|
||||
val.set_lane(false);
|
||||
// 1
|
||||
val.default_lane = Some(1);
|
||||
val.set_lane(true);
|
||||
val.apply_muts();
|
||||
assert_eq!(val.value, 2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wave_value_mutate_small() {
|
||||
let mut val = WaveValue::new(0, 2);
|
||||
val.default_lane = Some(0);
|
||||
assert!(!val.read());
|
||||
assert_eq!(val.value, 0);
|
||||
val.set_lane(true);
|
||||
val.apply_muts();
|
||||
assert!(val.read());
|
||||
assert_eq!(val.value, 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wave_value_mutations() {
|
||||
let mut val = WaveValue::new(0b10001, 32);
|
||||
val.default_lane = Some(0);
|
||||
val.set_lane(false);
|
||||
assert!(val.mutations.unwrap().iter().all(|x| !x));
|
||||
val.default_lane = Some(1);
|
||||
val.set_lane(true);
|
||||
assert_eq!(val.value, 0b10001);
|
||||
assert_eq!(
|
||||
val.mutations,
|
||||
Some([
|
||||
false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
|
||||
false, false, false, false, false, false, false, false, false, false, false, false, false,
|
||||
])
|
||||
);
|
||||
|
||||
val.apply_muts();
|
||||
assert_eq!(val.value, 0b10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_write16() {
|
||||
let mut vgpr = VGPR::new();
|
||||
vgpr.default_lane = Some(0);
|
||||
vgpr[0] = 0b11100000000000001111111111111111;
|
||||
vgpr[0].mut_lo16(0b1011101111111110);
|
||||
assert_eq!(vgpr[0], 0b11100000000000001011101111111110);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_write16hi() {
|
||||
let mut vgpr = VGPR::new();
|
||||
vgpr.default_lane = Some(0);
|
||||
vgpr[0] = 0b11100000000000001111111111111111;
|
||||
vgpr[0].mut_hi16(0b1011101111111110);
|
||||
assert_eq!(vgpr[0], 0b10111011111111101111111111111111);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vgpr() {
|
||||
let mut vgpr = VGPR::new();
|
||||
vgpr.default_lane = Some(0);
|
||||
vgpr[0] = 42;
|
||||
vgpr.default_lane = Some(10);
|
||||
vgpr[0] = 10;
|
||||
assert_eq!(vgpr.get_lane(0)[0], 42);
|
||||
assert_eq!(vgpr.get_lane(10)[0], 10);
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,323 +0,0 @@
|
|||
use crate::helpers::{colored, DEBUG};
|
||||
use crate::state::{Register, StateSnapshot, VecDataStore, WaveValue, VGPR};
|
||||
use crate::thread::{Thread, END_PRG, SGPR_COUNT};
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub const WAVE_SIZE: usize = 32;
|
||||
|
||||
pub struct WorkGroup<'a> {
|
||||
dispatch_dim: u32,
|
||||
id: [u32; 3],
|
||||
lds: VecDataStore,
|
||||
kernel: &'a Vec<u32>,
|
||||
kernel_args: *const u64,
|
||||
launch_bounds: [u32; 3],
|
||||
wave_state: HashMap<usize, WaveState>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct WaveState {
|
||||
scalar_reg: [u32; SGPR_COUNT],
|
||||
scc: u32,
|
||||
vcc: WaveValue,
|
||||
exec: WaveValue,
|
||||
vec_reg: VGPR,
|
||||
pc: usize,
|
||||
sds: HashMap<usize, VecDataStore>,
|
||||
}
|
||||
|
||||
const SYNCS: [u32; 4] = [0xBF89FC07, 0xBC7C0000, 0xBF890007, 0xbFB60003];
|
||||
const S_BARRIER: u32 = 0xBFBD0000;
|
||||
|
||||
/// Context for single-stepping through a wave - holds all mutable state
|
||||
pub struct WaveContext {
|
||||
pub kernel: Vec<u32>,
|
||||
pub scalar_reg: [u32; SGPR_COUNT],
|
||||
pub scc: u32,
|
||||
pub pc: usize,
|
||||
pub vec_reg: VGPR,
|
||||
pub vcc: WaveValue,
|
||||
pub exec: WaveValue,
|
||||
pub lds: VecDataStore,
|
||||
pub sds: HashMap<usize, VecDataStore>,
|
||||
pub n_lanes: usize,
|
||||
}
|
||||
|
||||
impl WaveContext {
|
||||
pub fn new(kernel: Vec<u32>, n_lanes: usize) -> Self {
|
||||
let active = (!0u32).wrapping_shr(32 - (n_lanes as u32));
|
||||
Self {
|
||||
kernel,
|
||||
scalar_reg: [0; SGPR_COUNT],
|
||||
scc: 0,
|
||||
pc: 0,
|
||||
vec_reg: VGPR::new(),
|
||||
vcc: WaveValue::new(0, n_lanes),
|
||||
exec: WaveValue::new(active, n_lanes),
|
||||
lds: VecDataStore::new(),
|
||||
sds: (0..=31).map(|i| (i, VecDataStore::new())).collect(),
|
||||
n_lanes,
|
||||
}
|
||||
}
|
||||
|
||||
/// Execute a single instruction. Returns: 0=continue, -1=endpgm, -2=barrier, 1=done (pc past program), negative=error
|
||||
pub fn step(&mut self) -> i32 {
|
||||
if self.pc >= self.kernel.len() { return 1; }
|
||||
if self.kernel[self.pc] == END_PRG { return -1; }
|
||||
if self.kernel[self.pc] == S_BARRIER { self.pc += 1; return -2; }
|
||||
// Skip sync/nop instructions
|
||||
if SYNCS.contains(&self.kernel[self.pc]) || self.kernel[self.pc] >> 20 == 0xbf8 || self.kernel[self.pc] == 0x7E000000 {
|
||||
self.pc += 1;
|
||||
return 0;
|
||||
}
|
||||
|
||||
let mut sgpr_co = None;
|
||||
for lane_id in 0..self.n_lanes {
|
||||
self.vec_reg.default_lane = Some(lane_id);
|
||||
self.vcc.default_lane = Some(lane_id);
|
||||
self.exec.default_lane = Some(lane_id);
|
||||
let mut thread = Thread {
|
||||
scalar_reg: &mut self.scalar_reg,
|
||||
scc: &mut self.scc,
|
||||
vec_reg: &mut self.vec_reg,
|
||||
vcc: &mut self.vcc,
|
||||
exec: &mut self.exec,
|
||||
lds: &mut self.lds,
|
||||
sds: &mut self.sds.get_mut(&lane_id).unwrap(),
|
||||
pc_offset: 0,
|
||||
stream: self.kernel[self.pc..].to_vec(),
|
||||
scalar: false,
|
||||
simm: None,
|
||||
warp_size: self.n_lanes,
|
||||
sgpr_co: &mut sgpr_co,
|
||||
};
|
||||
if let Err(e) = thread.interpret() { return e; }
|
||||
if thread.scalar {
|
||||
self.pc = ((self.pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
|
||||
break;
|
||||
}
|
||||
if lane_id == self.n_lanes - 1 {
|
||||
self.pc = ((self.pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
|
||||
}
|
||||
}
|
||||
if self.vcc.mutations.is_some() { self.vcc.apply_muts(); self.vcc.mutations = None; }
|
||||
if self.exec.mutations.is_some() { self.exec.apply_muts(); self.exec.mutations = None; }
|
||||
if let Some((idx, mut wv)) = sgpr_co.take() { wv.apply_muts(); self.scalar_reg[idx] = wv.value; }
|
||||
0
|
||||
}
|
||||
|
||||
pub fn get_snapshot(&self) -> StateSnapshot {
|
||||
let mut snap = StateSnapshot::new();
|
||||
snap.pc = self.pc as u32;
|
||||
snap.scc = self.scc;
|
||||
snap.vcc = self.vcc.value;
|
||||
snap.exec_mask = self.exec.value;
|
||||
snap.sgpr = self.scalar_reg;
|
||||
for lane in 0..32 { snap.vgpr[lane] = self.vec_reg.get_lane(lane); }
|
||||
snap
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> WorkGroup<'a> {
|
||||
pub fn new(dispatch_dim: u32, id: [u32; 3], launch_bounds: [u32; 3], kernel: &'a Vec<u32>, kernel_args: *const u64) -> Self {
|
||||
Self { dispatch_dim, id, kernel, launch_bounds, kernel_args, lds: VecDataStore::new(), wave_state: HashMap::new() }
|
||||
}
|
||||
|
||||
pub fn exec_waves(&mut self) -> Result<(), i32> {
|
||||
let mut threads = vec![];
|
||||
for z in 0..self.launch_bounds[2] {
|
||||
for y in 0..self.launch_bounds[1] {
|
||||
for x in 0..self.launch_bounds[0] {
|
||||
threads.push([x, y, z])
|
||||
}
|
||||
}
|
||||
}
|
||||
let waves = threads.chunks(WAVE_SIZE).collect::<Vec<_>>();
|
||||
|
||||
let mut sync = false;
|
||||
for (i, x) in self.kernel.iter().enumerate() {
|
||||
if i != 0 && *x == S_BARRIER {
|
||||
sync = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
for _ in 0..=(sync as usize) {
|
||||
for w in waves.iter().enumerate() {
|
||||
self.exec_wave(w)?
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn exec_wave(&mut self, (wave_id, threads): (usize, &&[[u32; 3]])) -> Result<(), i32> {
|
||||
let (mut scalar_reg, mut scc, mut pc, mut vec_reg, mut vcc, mut exec, mut sds) = match self.wave_state.get(&wave_id) {
|
||||
None => {
|
||||
let mut scalar_reg = [0; SGPR_COUNT];
|
||||
scalar_reg.write64(0, self.kernel_args as u64);
|
||||
|
||||
let [gx, gy, gz] = self.id;
|
||||
match self.dispatch_dim {
|
||||
3 => (scalar_reg[13], scalar_reg[14], scalar_reg[15]) = (gx, gy, gz),
|
||||
2 => (scalar_reg[14], scalar_reg[15]) = (gx, gy),
|
||||
_ => scalar_reg[15] = gx,
|
||||
}
|
||||
|
||||
let mut vec_reg = VGPR::new();
|
||||
for (t, [x, y, z]) in threads.iter().enumerate() {
|
||||
vec_reg.get_lane_mut(t)[0] = match &self.launch_bounds {
|
||||
[_, 1, 1] => *x,
|
||||
_ => (z << 20) | (y << 10) | x,
|
||||
}
|
||||
}
|
||||
|
||||
let vcc = WaveValue::new(0, threads.len());
|
||||
let active = (!0u32).wrapping_shr(32 - (threads.len() as u32));
|
||||
let exec = WaveValue::new(active, threads.len());
|
||||
|
||||
let sds = (0..=31).map(|i| (i, VecDataStore::new())).collect();
|
||||
(scalar_reg, 0, 0, vec_reg, vcc, exec, sds)
|
||||
}
|
||||
|
||||
Some(val) => {
|
||||
let val = val.clone();
|
||||
(val.scalar_reg, val.scc, val.pc, val.vec_reg, val.vcc, val.exec, val.sds)
|
||||
}
|
||||
};
|
||||
|
||||
loop {
|
||||
if self.kernel[pc] == END_PRG {
|
||||
break Ok(());
|
||||
}
|
||||
if self.kernel[pc] == S_BARRIER && self.wave_state.get(&wave_id).is_none() {
|
||||
self.wave_state.insert(wave_id, WaveState { scalar_reg, scc, vec_reg, vcc, exec, pc, sds });
|
||||
break Ok(());
|
||||
}
|
||||
if self.kernel[pc] == S_BARRIER || SYNCS.contains(&self.kernel[pc]) || self.kernel[pc] >> 20 == 0xbf8 || self.kernel[pc] == 0x7E000000 {
|
||||
pc += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
let mut sgpr_co = None;
|
||||
for (lane_id, [x, y, z]) in threads.iter().enumerate() {
|
||||
vec_reg.default_lane = Some(lane_id);
|
||||
vcc.default_lane = Some(lane_id);
|
||||
exec.default_lane = Some(lane_id);
|
||||
if *DEBUG {
|
||||
let lane = format!("{:<2} {:08X} ", lane_id, self.kernel[pc]);
|
||||
let state = match exec.read() {
|
||||
true => "green",
|
||||
false => "gray",
|
||||
};
|
||||
let [id0, id1, id2] = self.id;
|
||||
print!("[{id0:<3} {id1:<3} {id2:<3}] [{x:<3} {y:<3} {z:<3}] {}", colored(&lane, state));
|
||||
}
|
||||
let mut thread = Thread {
|
||||
scalar_reg: &mut scalar_reg,
|
||||
scc: &mut scc,
|
||||
vec_reg: &mut vec_reg,
|
||||
vcc: &mut vcc,
|
||||
exec: &mut exec,
|
||||
lds: &mut self.lds,
|
||||
sds: &mut sds.get_mut(&lane_id).unwrap(),
|
||||
pc_offset: 0,
|
||||
stream: self.kernel[pc..self.kernel.len()].to_vec(),
|
||||
scalar: false,
|
||||
simm: None,
|
||||
warp_size: threads.len(),
|
||||
sgpr_co: &mut sgpr_co,
|
||||
};
|
||||
thread.interpret()?;
|
||||
if *DEBUG {
|
||||
println!();
|
||||
}
|
||||
if thread.scalar {
|
||||
pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
|
||||
break;
|
||||
}
|
||||
if lane_id == threads.len() - 1 {
|
||||
pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
|
||||
}
|
||||
}
|
||||
|
||||
if vcc.mutations.is_some() {
|
||||
vcc.apply_muts();
|
||||
vcc.mutations = None;
|
||||
}
|
||||
if exec.mutations.is_some() {
|
||||
exec.apply_muts();
|
||||
exec.mutations = None;
|
||||
}
|
||||
if let Some((idx, mut wv)) = sgpr_co.take() {
|
||||
wv.apply_muts();
|
||||
scalar_reg[idx] = wv.value;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test_workgroup {
|
||||
use super::*;
|
||||
|
||||
// TODO: make this generic by adding the assembler
|
||||
fn global_store_sgpr(addr: u64, instructions: Vec<u32>, src: u32) -> Vec<u32> {
|
||||
[
|
||||
instructions,
|
||||
vec![
|
||||
0x7E020200 + src,
|
||||
0x7E0402FF,
|
||||
addr as u32,
|
||||
0x7E0602FF,
|
||||
(addr >> 32) as u32,
|
||||
0xDC6A0000,
|
||||
0x007C0102,
|
||||
],
|
||||
vec![END_PRG],
|
||||
]
|
||||
.concat()
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wave_value_state_vcc() {
|
||||
let mut ret: u32 = 0;
|
||||
let kernel = vec![
|
||||
0xBEEA00FF,
|
||||
0b11111111111111111111111111111111, // initial vcc state
|
||||
0x7E140282,
|
||||
0x7C94010A, // cmp blockDim.x == 2
|
||||
];
|
||||
let addr = (&mut ret as *mut u32) as u64;
|
||||
let kernel = global_store_sgpr(addr, kernel, 106);
|
||||
let mut wg = WorkGroup::new(1, [0, 0, 0], [3, 1, 1], &kernel, [addr].as_ptr());
|
||||
wg.exec_waves().unwrap();
|
||||
assert_eq!(ret, 0b100);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wave_value_state_exec() {
|
||||
let mut ret: u32 = 0;
|
||||
let kernel = vec![
|
||||
0xBEFE00FF,
|
||||
0b11111111111111111111111111111111,
|
||||
0x7E140282,
|
||||
0x7D9C010A, // cmpx blockDim.x <= 2
|
||||
];
|
||||
let addr = (&mut ret as *mut u32) as u64;
|
||||
let kernel = global_store_sgpr(addr, kernel, 126);
|
||||
let mut wg = WorkGroup::new(1, [0, 0, 0], [4, 1, 1], &kernel, [addr].as_ptr());
|
||||
wg.exec_waves().unwrap();
|
||||
assert_eq!(ret, 0b0111);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_wave_value_sgpr_co() {
|
||||
let mut ret: u32 = 0;
|
||||
let kernel = vec![0xBE8D00FF, 0x7FFFFFFF, 0x7E1402FF, u32::MAX, 0xD700000A, 0x0002010A];
|
||||
let addr = (&mut ret as *mut u32) as u64;
|
||||
let kernel = global_store_sgpr(addr, kernel, 0);
|
||||
let mut wg = WorkGroup::new(1, [0, 0, 0], [5, 1, 1], &kernel, [addr].as_ptr());
|
||||
wg.exec_waves().unwrap();
|
||||
assert_eq!(ret, 0b11110);
|
||||
}
|
||||
}
|
||||
|
|
@ -1,155 +0,0 @@
|
|||
# ruff: noqa: F405, F403
|
||||
# allow define from star imports
|
||||
|
||||
import numpy as np
|
||||
import unittest
|
||||
import subprocess, struct, math, functools
|
||||
from tinygrad import Tensor, dtypes, Device
|
||||
from tinygrad.helpers import getenv
|
||||
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
from tinygrad.renderer.amd.asm import waitcnt
|
||||
|
||||
from test.testextra.test_cfg_viz import asm_kernel
|
||||
|
||||
def get_output(asm:list, n_threads:int=1, vdst:VGPR=v[1]):
|
||||
out = Tensor([0]*n_threads, dtype=dtypes.uint32).realize()
|
||||
insts = [
|
||||
s_load_b64(s[0:1], s[0:1], NULL),
|
||||
*asm,
|
||||
v_lshlrev_b32_e32(v[0], 2, v[0]),
|
||||
s_waitcnt(simm16=waitcnt(lgkmcnt=0)),
|
||||
#global_store_b32(v[0], v[1], s[0:1]),
|
||||
global_store_b32(addr=v[0], data=vdst, saddr=s[0:1]),
|
||||
s_endpgm()
|
||||
]
|
||||
out = Tensor.custom_kernel(out, fxn=functools.partial(asm_kernel, name="test", insts=insts, device=out.device, n_threads=n_threads))[0]
|
||||
out.realize()
|
||||
return out.tolist()
|
||||
|
||||
def f16_to_bits(x:float) -> int: return struct.unpack('<H', struct.pack('<e', x))[0]
|
||||
def f32_from_bits(x:int) -> float: return struct.unpack('<f', struct.pack('<I', x))[0]
|
||||
def f32_to_bits(x:float) -> int: return struct.unpack('<I', struct.pack('<f', x))[0]
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "AMD", "tests RDNA3")
|
||||
class TestHW(unittest.TestCase):
|
||||
def setUp(self):
|
||||
if getenv("MOCKGPU"): subprocess.run(["cargo", "build", "--release", "--manifest-path", "./extra/remu/Cargo.toml"], check=True)
|
||||
|
||||
def test_simple_v_mov(self):
|
||||
out = get_output([
|
||||
v_mov_b32_e32(v[1], 2),
|
||||
])
|
||||
self.assertEqual(out, [2])
|
||||
|
||||
def test_simple_s_mov(self):
|
||||
out = get_output([
|
||||
s_mov_b32(s[7], 0x7fffffff),
|
||||
v_mov_b32_e32(v[1], s[7]),
|
||||
])
|
||||
self.assertEqual(out, [0x7fffffff])
|
||||
|
||||
def test_exec_mov(self):
|
||||
out = get_output([
|
||||
v_mov_b32_e32(v[1], 42),
|
||||
s_mov_b32(EXEC_LO, 0b10),
|
||||
v_mov_b32_e32(v[1], 10),
|
||||
s_mov_b32(EXEC_LO, 0b11),
|
||||
], n_threads=2)
|
||||
np.testing.assert_equal(out, [42, 10])
|
||||
|
||||
def test_exec_cmp_vopc(self):
|
||||
out = get_output([
|
||||
s_mov_b32(VCC_LO, 0), # reset vcc
|
||||
v_mov_b32_e32(v[1], 42),
|
||||
v_mov_b32_e32(v[2], 10),
|
||||
s_mov_b32(EXEC_LO, 0b01),
|
||||
v_cmp_ne_u32_e32(v[1], v[2]),
|
||||
s_mov_b32(EXEC_LO, 0b11),
|
||||
v_mov_b32_e32(v[1], VCC_LO),
|
||||
], n_threads=2)[0]
|
||||
np.testing.assert_equal(out, 1)
|
||||
|
||||
def test_exec_cmpx_vop3(self):
|
||||
out = get_output([
|
||||
s_mov_b32(EXEC_LO, 0b11),
|
||||
v_mov_b32_e32(v[1], 42),
|
||||
v_mov_b32_e32(v[2], 10),
|
||||
s_mov_b32(EXEC_LO, 0b01),
|
||||
v_cmpx_ne_u32_e32(v[1], v[2]),
|
||||
s_mov_b32(s[10], EXEC_LO),
|
||||
s_mov_b32(EXEC_LO, 0b11),
|
||||
v_mov_b32_e32(v[1], s[10]),
|
||||
], n_threads=2)[0]
|
||||
np.testing.assert_equal(out & 0b11, 0b01)
|
||||
|
||||
def test_fmac_vop3_modifier(self):
|
||||
init_state = [
|
||||
v_mov_b32_e32(a:=v[1], f16_to_bits(4.0)),
|
||||
v_mov_b32_e32(b:=v[2], f16_to_bits(3.0)),
|
||||
v_mov_b32_e32(c:=v[3], f16_to_bits(2.0)),
|
||||
]
|
||||
def run_fmac(a, b): return get_output(init_state+[v_fmac_f16_e64(c, a, b)], vdst=c)[0]
|
||||
self.assertEqual(run_fmac(a, b), f16_to_bits(14.0))
|
||||
self.assertEqual(run_fmac(a, -b), f16_to_bits(-10.0))
|
||||
self.assertEqual(run_fmac(-a, -b), f16_to_bits(14.0))
|
||||
|
||||
def test_s_abs_i32(self):
|
||||
def check(x, y, dst=s[10], scc=0):
|
||||
for reg,val in [(dst, y), (SCC, scc)]:
|
||||
self.assertEqual(get_output([
|
||||
s_mov_b32(dst, x),
|
||||
s_abs_i32(dst, dst),
|
||||
v_mov_b32_e32(v[1], reg)
|
||||
])[0], val)
|
||||
|
||||
check(0x00000001, 0x00000001, scc=1)
|
||||
check(0x7fffffff, 0x7fffffff, scc=1)
|
||||
check(0x80000000, 0x80000000, scc=1)
|
||||
check(0x80000001, 0x7fffffff, scc=1)
|
||||
check(0x80000002, 0x7ffffffe, scc=1)
|
||||
check(0xffffffff, 0x00000001, scc=1)
|
||||
check(0, 0, scc=0)
|
||||
|
||||
def test_v_rcp_f32_neg_vop3(self):
|
||||
def v_neg_rcp_f32(x:float, y:float):
|
||||
out = get_output([
|
||||
v_mov_b32_e32(v[2], f32_to_bits(x)),
|
||||
v_rcp_f32_e64(v[2], -v[2]),
|
||||
], vdst=v[2])[0]
|
||||
assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
|
||||
|
||||
v_neg_rcp_f32(math.inf, -0.0)
|
||||
v_neg_rcp_f32(-math.inf, 0.0)
|
||||
v_neg_rcp_f32(0.0, -math.inf)
|
||||
v_neg_rcp_f32(-0.0, math.inf)
|
||||
v_neg_rcp_f32(-2.0, 0.5)
|
||||
v_neg_rcp_f32(2.0, -0.5)
|
||||
|
||||
def test_v_cndmask_b32_neg(self):
|
||||
def v_neg(x:float, y:float):
|
||||
out = get_output([
|
||||
v_mov_b32_e32(v[1], f32_to_bits(x)),
|
||||
s_mov_b32(s[10], 1),
|
||||
v_cndmask_b32_e64(v[1], v[1], -v[1], s[10]),
|
||||
])[0]
|
||||
assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
|
||||
|
||||
v_neg(-0.0, 0.0)
|
||||
v_neg(0.0, -0.0)
|
||||
v_neg(2.0, -2.0)
|
||||
v_neg(math.inf, -math.inf)
|
||||
v_neg(-math.inf, math.inf)
|
||||
|
||||
@unittest.skip("how does VOPD work in the dsl")
|
||||
def test_v_subrev_wrap(self):
|
||||
out = get_output([
|
||||
#v_dual_mov_b32(v[1], 0xffffffff, v[2], 0x0),
|
||||
#v_dual_mov_b32(vdstx=v[1], srcx=0xffffffff, vdsty=v[2], srcy=0x0),
|
||||
#VOPD(opx=VOPDOp.V_DUAL_MOV_B32, opy=VOPDOp.V_DUAL_MOV_B32, vdstx=v[1], srcx=0xffffffff, vdsty=v[2], srcy=0x0),
|
||||
v_subrev_co_u32(v[2], VCC_LO, v[2], v[1]),
|
||||
], vdst=v[2])[0]
|
||||
self.assertEqual(out, 0xffff_ffff)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
@ -3,7 +3,7 @@ INSTALL_PATH="${1:-/opt/homebrew/lib}"
|
|||
if [ ! -d "$INSTALL_PATH" ]; then
|
||||
USER=$(whoami)
|
||||
echo "No path $INSTALL_PATH. Will create. Might need your password..."
|
||||
echo "You can stop now and provide any location as an argument where you want to save the libs (note, that not default locations should be in LD_LIBRARY_PATH, so tinygrad can find the libs)."
|
||||
echo "You can stop now and provide any location as an argument where you want to save the library (note, that not default locations should be in LD_LIBRARY_PATH, so tinygrad can find it)."
|
||||
echo "Press any key or symbol to continue..."
|
||||
read -n 1 -s
|
||||
|
||||
|
|
@ -11,11 +11,6 @@ if [ ! -d "$INSTALL_PATH" ]; then
|
|||
sudo chown -R "$USER":staff "$INSTALL_PATH"
|
||||
fi
|
||||
|
||||
# Download libremu.dylib
|
||||
curl -s https://api.github.com/repos/Qazalin/remu/releases/latest | \
|
||||
jq -r '.assets[] | select(.name == "libremu.dylib").browser_download_url' | \
|
||||
xargs curl -L -o $INSTALL_PATH/libremu.dylib
|
||||
|
||||
# Download libamd_comgr.dylib
|
||||
curl -s https://api.github.com/repos/tinygrad/amdcomgr_dylib/releases/latest | \
|
||||
jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \
|
||||
|
|
|
|||
|
|
@ -1,528 +0,0 @@
|
|||
# Test to compare Python and Rust RDNA3 emulators by running real tinygrad kernels
|
||||
import unittest, ctypes
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from tinygrad import Device
|
||||
|
||||
from test.mockgpu.amd.emu import WaveState, _decode_at, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
|
||||
from tinygrad.renderer.amd import decode_inst
|
||||
import tinygrad
|
||||
REMU_PATH = Path(tinygrad.__file__).parent.parent / "extra/remu/target/release/libremu.so"
|
||||
if not REMU_PATH.exists(): REMU_PATH = Path(tinygrad.__file__).parent.parent / "extra/remu/target/release/libremu.dylib"
|
||||
|
||||
def set_valid_mem_ranges(ranges): pass # emu2 doesn't need this
|
||||
|
||||
def _is_f32_nan(bits: int) -> bool:
|
||||
"""Check if 32-bit value is a NaN (exponent all 1s, mantissa non-zero)."""
|
||||
return (bits & 0x7f800000) == 0x7f800000 and (bits & 0x007fffff) != 0
|
||||
|
||||
def _vals_equal(a: int, b: int) -> bool:
|
||||
"""Compare two 32-bit values, treating all NaN bit patterns as equal."""
|
||||
if a == b: return True
|
||||
return _is_f32_nan(a) and _is_f32_nan(b)
|
||||
|
||||
@dataclass
|
||||
class KernelSnapshot:
|
||||
code: bytes
|
||||
src: str
|
||||
global_size: tuple[int, int, int]
|
||||
local_size: tuple[int, int, int]
|
||||
buf_idxs: list[int] # indices into shared buffer pool
|
||||
buf_sizes: list[int] # sizes for each buffer index
|
||||
|
||||
@dataclass
|
||||
class StateSnapshot:
|
||||
pc: int
|
||||
scc: int
|
||||
vcc: int
|
||||
exec_mask: int
|
||||
sgpr: list[int]
|
||||
vgpr: list[list[int]]
|
||||
|
||||
def diff(self, other: 'StateSnapshot', n_lanes: int, arrow: str = " vs ") -> list[str]:
|
||||
"""Return list of differences between two states."""
|
||||
diffs = []
|
||||
if self.pc != other.pc: diffs.append(f"pc: {self.pc}{arrow}{other.pc}")
|
||||
if self.scc != other.scc: diffs.append(f"scc: {self.scc}{arrow}{other.scc}")
|
||||
if self.vcc != other.vcc: diffs.append(f"vcc: 0x{self.vcc:08x}{arrow}0x{other.vcc:08x}")
|
||||
if self.exec_mask != other.exec_mask: diffs.append(f"exec: 0x{self.exec_mask:08x}{arrow}0x{other.exec_mask:08x}")
|
||||
for i, (a, b) in enumerate(zip(self.sgpr, other.sgpr)):
|
||||
# Skip VCC_LO/HI (106/107) and EXEC_LO/HI (126/127) as they alias vcc/exec_mask which are compared separately
|
||||
if i in (106, 107, 126, 127): continue
|
||||
if not _vals_equal(a, b): diffs.append(f"sgpr[{i}]: 0x{a:08x}{arrow}0x{b:08x}")
|
||||
for lane in range(n_lanes):
|
||||
for i, (a, b) in enumerate(zip(self.vgpr[lane], other.vgpr[lane])):
|
||||
if not _vals_equal(a, b): diffs.append(f"vgpr[{lane}][{i}]: 0x{a:08x}{arrow}0x{b:08x}")
|
||||
return diffs
|
||||
|
||||
class CStateSnapshot(ctypes.Structure):
|
||||
_fields_ = [("pc", ctypes.c_uint32), ("scc", ctypes.c_uint32), ("vcc", ctypes.c_uint32), ("exec_mask", ctypes.c_uint32),
|
||||
("sgpr", ctypes.c_uint32 * 128), ("vgpr", (ctypes.c_uint32 * 256) * 32)]
|
||||
|
||||
def to_snapshot(self) -> StateSnapshot:
|
||||
return StateSnapshot(pc=self.pc, scc=self.scc, vcc=self.vcc, exec_mask=self.exec_mask,
|
||||
sgpr=list(self.sgpr), vgpr=[list(self.vgpr[i]) for i in range(32)])
|
||||
|
||||
class RustEmulator:
|
||||
def __init__(self):
|
||||
self.lib = ctypes.CDLL(str(REMU_PATH))
|
||||
self.lib.wave_create.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32]
|
||||
self.lib.wave_create.restype = ctypes.c_void_p
|
||||
self.lib.wave_step.argtypes = [ctypes.c_void_p]
|
||||
self.lib.wave_step.restype = ctypes.c_int32
|
||||
self.lib.wave_get_snapshot.argtypes = [ctypes.c_void_p, ctypes.POINTER(CStateSnapshot)]
|
||||
self.lib.wave_set_sgpr.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32]
|
||||
self.lib.wave_set_vgpr.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32]
|
||||
self.lib.wave_init_lds.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
|
||||
self.lib.wave_free.argtypes = [ctypes.c_void_p]
|
||||
self.ctx = None
|
||||
|
||||
def create(self, kernel: bytes, n_lanes: int):
|
||||
kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
|
||||
self.ctx = self.lib.wave_create(ctypes.addressof(kernel_buf), len(kernel), n_lanes)
|
||||
self._kernel_buf = kernel_buf
|
||||
|
||||
def step(self) -> int: return self.lib.wave_step(self.ctx)
|
||||
def set_sgpr(self, idx: int, val: int): self.lib.wave_set_sgpr(self.ctx, idx, val)
|
||||
def set_vgpr(self, lane: int, idx: int, val: int): self.lib.wave_set_vgpr(self.ctx, lane, idx, val)
|
||||
def init_lds(self, size: int): self.lib.wave_init_lds(self.ctx, size)
|
||||
|
||||
def get_snapshot(self) -> StateSnapshot:
|
||||
snap = CStateSnapshot()
|
||||
self.lib.wave_get_snapshot(self.ctx, ctypes.byref(snap))
|
||||
return snap.to_snapshot()
|
||||
|
||||
def free(self):
|
||||
if self.ctx:
|
||||
self.lib.wave_free(self.ctx)
|
||||
self.ctx = None
|
||||
|
||||
class PythonEmulator:
|
||||
def __init__(self):
|
||||
self.state: WaveState | None = None
|
||||
self.program: dict[int, tuple] = {} # lazily populated: pc -> (name, fxn, globals)
|
||||
self.vmem_buf = None
|
||||
self.lds_buf = None
|
||||
self.kernel_buf = None # Keep kernel bytes alive
|
||||
self.lib_addr = 0 # Base address of kernel code
|
||||
|
||||
def create(self, kernel: bytes, n_lanes: int):
|
||||
import ctypes
|
||||
from tinygrad.device import Buffer, BufferSpec
|
||||
from tinygrad.dtype import dtypes
|
||||
# Store kernel in a ctypes buffer so _decode_at can read from memory at actual PC address
|
||||
self.kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
|
||||
self.lib_addr = ctypes.addressof(self.kernel_buf)
|
||||
self.program = {}
|
||||
self.state = WaveState(n_lanes)
|
||||
self.state.pc = self.lib_addr # Set PC to code base address
|
||||
self.vmem_buf = Buffer('CPU', 1 << 40, dtypes.uint32, options=BufferSpec(external_ptr=0)).ensure_allocated()
|
||||
self.lds_buf = Buffer('CPU', 65536 // 4, dtypes.uint32).ensure_allocated()
|
||||
|
||||
def _ensure_decoded(self, pc: int):
|
||||
if pc not in self.program:
|
||||
runner, _ = _decode_at(pc, "rdna3")
|
||||
self.program[pc] = (runner.p.function_name, runner._prg.fxn, runner.p.globals)
|
||||
|
||||
def step(self) -> int:
|
||||
import ctypes
|
||||
assert self.state is not None
|
||||
pc = self.state.pc
|
||||
if pc == 0xFFFFFFFFFFFFFFFF: return -1
|
||||
self._ensure_decoded(pc)
|
||||
name, fxn, globals_list = self.program[pc]
|
||||
buf_addrs = {0: self.state.sgpr_buf._buf.va_addr, 1: self.state.vgpr_buf._buf.va_addr, # type: ignore[union-attr]
|
||||
2: self.vmem_buf._buf.va_addr, 3: self.lds_buf._buf.va_addr} # type: ignore[union-attr]
|
||||
fxn(*[ctypes.c_uint64(buf_addrs[g]) for g in globals_list], ctypes.c_int32(0))
|
||||
return -1 if self.state.pc == 0xFFFFFFFFFFFFFFFF else 0
|
||||
|
||||
def set_sgpr(self, idx: int, val: int):
|
||||
assert self.state is not None
|
||||
self.state._write_sgpr(idx, val)
|
||||
def set_vgpr(self, lane: int, idx: int, val: int):
|
||||
assert self.state is not None
|
||||
self.state._write_vgpr(idx, lane, val)
|
||||
|
||||
def get_snapshot(self) -> StateSnapshot:
|
||||
assert self.state is not None
|
||||
sgpr = [self.state._read_sgpr(i) for i in range(128)]
|
||||
vgpr = [[self.state._read_vgpr(reg, lane) for reg in range(256)] for lane in range(WAVE_SIZE)]
|
||||
# Convert actual PC address to word offset for comparison with Rust emulator
|
||||
pc_offset = (self.state.pc - self.lib_addr) // 4 if self.state.pc != 0xFFFFFFFFFFFFFFFF else 0xFFFFFFFFFFFFFFFF
|
||||
return StateSnapshot(pc=pc_offset, scc=self.state._read_sgpr(SCC.offset), vcc=sgpr[VCC_LO.offset],
|
||||
exec_mask=sgpr[EXEC_LO.offset], sgpr=sgpr, vgpr=vgpr)
|
||||
|
||||
def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: tuple[int, int, int],
|
||||
local_size: tuple[int, int, int], max_steps: int, debug: bool, trace_len: int,
|
||||
kernel_idx: int = 0, max_workgroups: int = 8) -> tuple[bool, str, int]:
|
||||
"""Run a single kernel through both emulators. Returns (success, message, total_steps)."""
|
||||
gx, gy, gz = global_size
|
||||
lx, ly, lz = local_size
|
||||
total_steps = 0
|
||||
wg_count = 0
|
||||
|
||||
for gidz in range(gz):
|
||||
for gidy in range(gy):
|
||||
for gidx in range(gx):
|
||||
if wg_count >= max_workgroups: return True, f"Completed {wg_count} workgroups (limit reached)", total_steps
|
||||
wg_count += 1
|
||||
rust = RustEmulator()
|
||||
python = PythonEmulator()
|
||||
rust.create(kernel, n_lanes)
|
||||
python.create(kernel, n_lanes)
|
||||
|
||||
# Initialize LDS (64KB, standard size for AMD GPUs)
|
||||
rust.init_lds(65536)
|
||||
|
||||
for emu in (rust, python):
|
||||
emu.set_sgpr(0, args_ptr & 0xffffffff)
|
||||
emu.set_sgpr(1, (args_ptr >> 32) & 0xffffffff)
|
||||
emu.set_sgpr(13, gidx)
|
||||
emu.set_sgpr(14, gidy)
|
||||
emu.set_sgpr(15, gidz)
|
||||
# Initialize v[0] with packed workitem IDs for each lane
|
||||
for lane in range(n_lanes):
|
||||
tid = lane
|
||||
z, y, x = tid // (lx * ly), (tid // lx) % ly, tid % lx
|
||||
emu.set_vgpr(lane, 0, (z << 20) | (y << 10) | x)
|
||||
|
||||
step = 0
|
||||
trace: list[tuple[int, int, str, StateSnapshot, StateSnapshot]] = []
|
||||
prev_sync_after = False # Track if previous instruction had known Rust bugs
|
||||
try:
|
||||
while step < max_steps:
|
||||
rust_before = rust.get_snapshot()
|
||||
python_before = python.get_snapshot()
|
||||
|
||||
pc_addr = python.lib_addr + python_before.pc * 4 # Convert word offset to actual address
|
||||
python._ensure_decoded(pc_addr)
|
||||
inst_hex_name = python.program[pc_addr][0]
|
||||
# Decode the instruction to get mnemonic for sync_after checks
|
||||
try:
|
||||
# Format is mnemonic_hexbytes, e.g. v_exp_f32_e32_014b027e -> hex is 014b027e
|
||||
parts = inst_hex_name.rsplit('_', 1)
|
||||
inst_bytes_hex = parts[1] if len(parts) == 2 else ""
|
||||
inst_bytes = bytes.fromhex(inst_bytes_hex) if inst_bytes_hex else b''
|
||||
decoded = decode_inst(inst_bytes) if inst_bytes else None
|
||||
inst_mnemonic = repr(decoded).split('(')[0] if decoded else ""
|
||||
except Exception:
|
||||
inst_mnemonic = ""
|
||||
# For generic instructions, use function name for sync_after check
|
||||
if not inst_mnemonic: inst_mnemonic = inst_hex_name
|
||||
inst_str = inst_hex_name
|
||||
trace.append((step, python_before.pc, inst_str, rust_before, python_before))
|
||||
if len(trace) > trace_len: trace.pop(0)
|
||||
|
||||
if debug: print(f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: PC={python_before.pc}, inst={inst_str}")
|
||||
|
||||
# Instructions with known Rust emulator bugs or precision differences - sync Python to Rust after execution
|
||||
# v_div_scale/v_div_fixup: Rust has different VCC handling
|
||||
# v_cvt_f16_f32: Rust clears high 16 bits, but hardware (and Python) preserves them
|
||||
# s_add_i32/s_sub_i32: Rust has incorrect SCC overflow detection
|
||||
# v_exp_f32/v_log_f32/v_ldexp_f32: precision differences in transcendental functions
|
||||
# s_delay_alu: Rust handles differently
|
||||
# v_add_co_ci_u32/v_sub_co_ci_u32/v_subrev_co_ci_u32: Rust preserves inactive VCC bits, but hardware clears all bits
|
||||
sync_after = any(x in inst_mnemonic.lower() for x in ('v_div_scale', 'v_div_fixup', 'v_cvt_f16_f32', 's_add_i32', 's_sub_i32',
|
||||
'v_exp_f32', 'v_log_f32', 'v_ldexp_f32', 's_delay_alu',
|
||||
'v_add_co_ci_u32', 'v_sub_co_ci_u32', 'v_subrev_co_ci_u32'))
|
||||
# Skip comparison if previous instruction had known Rust bugs (states were synced but may still differ slightly)
|
||||
diffs = rust_before.diff(python_before, n_lanes) if not prev_sync_after else []
|
||||
if diffs:
|
||||
trace_lines = []
|
||||
for idx, (s, pc, d, rb, pb) in enumerate(trace):
|
||||
trace_lines.append(f" step {s}: PC={pc:3d} {d}")
|
||||
if idx < len(trace) - 1:
|
||||
next_rb, next_pb = trace[idx + 1][3:5]
|
||||
rust_diffs = rb.diff(next_rb, n_lanes, "->")
|
||||
python_diffs = pb.diff(next_pb, n_lanes, "->")
|
||||
if rust_diffs: trace_lines.append(f" rust: {', '.join(rust_diffs[:5])}")
|
||||
if python_diffs: trace_lines.append(f" python: {', '.join(python_diffs[:5])}")
|
||||
elif rust_diffs: trace_lines.append(" python: (no changes)")
|
||||
else:
|
||||
# Last traced instruction - compare with current state
|
||||
rust_diffs = rb.diff(rust_before, n_lanes, "->")
|
||||
python_diffs = pb.diff(python_before, n_lanes, "->")
|
||||
if rust_diffs: trace_lines.append(f" rust: {', '.join(rust_diffs[:5])}")
|
||||
if python_diffs: trace_lines.append(f" python: {', '.join(python_diffs[:5])}")
|
||||
elif rust_diffs: trace_lines.append(" python: (no changes)")
|
||||
trace_str = "\n".join(trace_lines)
|
||||
msg = f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step} before inst '{inst_str}': states differ (rust vs python):\n "
|
||||
msg += "\n ".join(diffs[:10]) + f"\n Recent instructions:\n{trace_str}"
|
||||
return False, msg, total_steps
|
||||
|
||||
rust_result = rust.step()
|
||||
python_result = python.step()
|
||||
|
||||
if rust_result != python_result:
|
||||
# Rust returns 1 for unsupported instructions - skip test
|
||||
if rust_result == 1 and python_result == 0:
|
||||
raise unittest.SkipTest(f"Rust emulator doesn't support instruction: {inst_str}")
|
||||
trace_str = "\n".join(f" step {s}: PC={pc:3d} {d}" for s, pc, d, _, _ in trace)
|
||||
msg = (f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: different return codes: "
|
||||
f"rust={rust_result}, python={python_result}, inst={inst_str}\n Recent instructions:\n{trace_str}")
|
||||
return False, msg, total_steps
|
||||
|
||||
# Sync Python state to Rust after instructions with known Rust emulator differences
|
||||
if sync_after:
|
||||
rust_after = rust.get_snapshot()
|
||||
for i in range(128): python.set_sgpr(i, rust_after.sgpr[i])
|
||||
for lane in range(n_lanes):
|
||||
for i in range(256): python.set_vgpr(lane, i, rust_after.vgpr[lane][i])
|
||||
assert python.state is not None
|
||||
# Convert Rust's word-based PC to Python's actual address
|
||||
python.state.pc = python.lib_addr + rust_after.pc * 4
|
||||
python.state._write_sgpr(SCC.offset, rust_after.scc)
|
||||
python.state._write_sgpr(VCC_LO.offset, rust_after.vcc)
|
||||
python.state._write_sgpr(EXEC_LO.offset, rust_after.exec_mask)
|
||||
prev_sync_after = sync_after
|
||||
|
||||
if rust_result == -1:
|
||||
total_steps += step + 1
|
||||
break
|
||||
if rust_result == 1:
|
||||
total_steps += step + 1
|
||||
break
|
||||
if rust_result < 0 and rust_result != -2:
|
||||
return False, f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: error code {rust_result}", total_steps
|
||||
|
||||
step += 1
|
||||
else:
|
||||
return False, f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Max steps ({max_steps}) reached", total_steps
|
||||
finally:
|
||||
rust.free()
|
||||
|
||||
return True, f"Completed {gx*gy*gz} workgroups", total_steps
|
||||
|
||||
def compare_emulators_multi_kernel(kernels: list[KernelSnapshot], buf_pool: dict[int, int], max_steps: int = 1000,
|
||||
debug: bool = False, trace_len: int = 10, buf_data: dict[int, bytes] | None = None) -> tuple[bool, str]:
|
||||
"""Run all kernels through both emulators with shared buffer pool."""
|
||||
if buf_data is None: buf_data = {}
|
||||
|
||||
# Allocate shared buffer pool with padding for over-reads (GPU loads up to 16 bytes at once)
|
||||
buf_id_to_ptr: dict[int, int] = {}
|
||||
buffers = []
|
||||
for buf_id, size in buf_pool.items():
|
||||
padded_size = ((size + 15) // 16) * 16 + 16 # round up to 16 bytes + extra padding
|
||||
# Initialize with data from COPY if available
|
||||
init_data = buf_data.get(buf_id, b'\x00' * padded_size)
|
||||
init_list = list(init_data) + [0] * (padded_size - len(init_data))
|
||||
buf = (ctypes.c_uint8 * padded_size)(*init_list[:padded_size])
|
||||
buffers.append((buf, padded_size))
|
||||
buf_id_to_ptr[buf_id] = ctypes.addressof(buf)
|
||||
|
||||
# Set up valid memory ranges
|
||||
ranges = {(ctypes.addressof(b), size) for b, size in buffers}
|
||||
|
||||
total_steps = 0
|
||||
for ki, kernel in enumerate(kernels):
|
||||
# Create args array for this kernel's buffers
|
||||
args = (ctypes.c_uint64 * len(kernel.buf_idxs))(*[buf_id_to_ptr[bid] for bid in kernel.buf_idxs])
|
||||
args_ptr = ctypes.addressof(args)
|
||||
|
||||
# Update valid ranges to include this args array
|
||||
kernel_ranges = ranges | {(args_ptr, ctypes.sizeof(args))}
|
||||
set_valid_mem_ranges(kernel_ranges)
|
||||
|
||||
n_lanes = kernel.local_size[0] * kernel.local_size[1] * kernel.local_size[2]
|
||||
|
||||
ok, msg, steps = run_single_kernel(
|
||||
kernel.code, min(n_lanes, 32), args_ptr, kernel.global_size,
|
||||
kernel.local_size, max_steps, debug, trace_len, ki
|
||||
)
|
||||
total_steps += steps
|
||||
if not ok:
|
||||
return False, msg
|
||||
|
||||
return True, f"Completed {len(kernels)} kernels, {total_steps} total steps"
|
||||
|
||||
def compare_emulators_with_memory(kernel: bytes, n_lanes: int, buf_sizes: list, max_steps: int = 1000, debug: bool = False,
|
||||
global_size: tuple[int, int, int] = (1, 1, 1), trace_len: int = 10) -> tuple[bool, str]:
|
||||
"""Run both emulators with memory set up for tinygrad kernels, executing all workgroups. Legacy wrapper."""
|
||||
# Allocate buffers
|
||||
buffers = []
|
||||
for size in buf_sizes:
|
||||
buf = (ctypes.c_uint8 * size)(*[0] * size)
|
||||
buffers.append(buf)
|
||||
|
||||
# Create args array with buffer pointers
|
||||
args = (ctypes.c_uint64 * len(buffers))(*[ctypes.addressof(b) for b in buffers])
|
||||
args_ptr = ctypes.addressof(args)
|
||||
|
||||
# Set up valid memory ranges for Python emulator
|
||||
ranges = {(ctypes.addressof(b), len(b)) for b in buffers}
|
||||
ranges.add((args_ptr, ctypes.sizeof(args)))
|
||||
set_valid_mem_ranges(ranges)
|
||||
|
||||
# Legacy wrapper assumes local_size = (n_lanes, 1, 1)
|
||||
ok, msg, _ = run_single_kernel(kernel, n_lanes, args_ptr, global_size, (n_lanes, 1, 1), max_steps, debug, trace_len)
|
||||
return ok, msg
|
||||
|
||||
def get_kernels_from_tinygrad(op_fn) -> tuple[list[KernelSnapshot], dict[int, int], dict[int, bytes]]:
|
||||
"""Compile a tinygrad operation and extract all kernels with their buffer mappings."""
|
||||
from tinygrad import Tensor
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
|
||||
out = op_fn(Tensor)
|
||||
sched = out.schedule()
|
||||
kernels = []
|
||||
buf_pool: dict[int, int] = {} # buffer id -> size
|
||||
buf_data: dict[int, bytes] = {} # buffer id -> initial data from COPY
|
||||
|
||||
for ei in sched:
|
||||
lowered = ei.lower()
|
||||
if ei.ast.op.name == 'COPY':
|
||||
# Handle COPY: extract source data to initialize destination buffer
|
||||
if len(lowered.bufs) >= 2:
|
||||
dst_buf, src_buf = lowered.bufs[0], lowered.bufs[1]
|
||||
dst_id = id(dst_buf)
|
||||
if dst_id not in buf_pool:
|
||||
buf_pool[dst_id] = dst_buf.nbytes
|
||||
# Get source data if it's from numpy/CPU
|
||||
if hasattr(src_buf, 'base') and src_buf.base is not None and hasattr(src_buf.base, '_buf'):
|
||||
src_data = bytes(src_buf.base._buf)
|
||||
buf_data[dst_id] = src_data
|
||||
elif ei.ast.op.name == 'SINK':
|
||||
if lowered.prg and lowered.prg.p.lib:
|
||||
lib = bytes(lowered.prg.p.lib)
|
||||
_, sections, _ = elf_loader(lib)
|
||||
for sec in sections:
|
||||
if sec.name == '.text':
|
||||
buf_idxs = []
|
||||
buf_sizes = []
|
||||
for b in lowered.bufs:
|
||||
buf_id = id(b)
|
||||
if buf_id not in buf_pool:
|
||||
buf_pool[buf_id] = b.nbytes
|
||||
buf_idxs.append(buf_id)
|
||||
buf_sizes.append(b.nbytes)
|
||||
kernels.append(KernelSnapshot(
|
||||
code=bytes(sec.content),
|
||||
src=lowered.prg.p.src,
|
||||
global_size=tuple(lowered.prg.p.global_size),
|
||||
local_size=tuple(lowered.prg.p.local_size),
|
||||
buf_idxs=buf_idxs,
|
||||
buf_sizes=buf_sizes
|
||||
))
|
||||
if not kernels: raise RuntimeError("No kernel found")
|
||||
return kernels, buf_pool, buf_data
|
||||
|
||||
def get_kernel_from_tinygrad(op_fn) -> tuple[bytes, tuple[int, int, int], tuple[int, int, int], list]:
|
||||
"""Compile a tinygrad operation and extract the last (main) kernel binary. Legacy wrapper."""
|
||||
kernels, _, _ = get_kernels_from_tinygrad(op_fn)
|
||||
k = kernels[-1]
|
||||
return k.code, k.global_size, k.local_size, k.buf_sizes
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "AMD", "requires AMD device")
|
||||
class TestTinygradKernels(unittest.TestCase):
|
||||
"""Compare emulators on real tinygrad-compiled kernels."""
|
||||
|
||||
def _test_kernel(self, op_fn, max_steps=10000):
|
||||
kernels, buf_pool, buf_data = get_kernels_from_tinygrad(op_fn)
|
||||
ok, msg = compare_emulators_multi_kernel(kernels, buf_pool, max_steps=max_steps, buf_data=buf_data)
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
# Basic ops - consolidated tests covering key instruction patterns
|
||||
def test_unary_ops(self): self._test_kernel(lambda T: T([-1.0, 0.0, 1.0, 2.0]).relu().exp().log().sqrt().reciprocal())
|
||||
def test_binary_ops(self): self._test_kernel(lambda T: (T([1.0, 2.0]) + T([3.0, 4.0])) * T([0.5, 0.5]) - T([1.0, 1.0]))
|
||||
def test_trig(self): self._test_kernel(lambda T: T([0.1, 1.0, 3.14, -1.0]*8).sin() + T([0.1, 1.0, 3.14, -1.0]*8).cos())
|
||||
def test_compare(self): self._test_kernel(lambda T: (T.empty(64) < T.empty(64)).where(T.empty(64), T.empty(64)))
|
||||
def test_bitwise(self): self._test_kernel(lambda T: (T([0xF0, 0x0F, 0xFF]*11).int() & T([0x0F, 0x0F, 0x00]*11).int()) | T([1]*33).int())
|
||||
def test_int_ops(self): self._test_kernel(lambda T: ((T.empty(64).int() + T.empty(64).int()) * T.empty(64).int()).float())
|
||||
|
||||
# Reductions
|
||||
def test_reduce(self): self._test_kernel(lambda T: T.empty(64).sum() + T.empty(64).max())
|
||||
def test_argmax(self): self._test_kernel(lambda T: T.empty(64).argmax())
|
||||
|
||||
# Matmul
|
||||
def test_gemm(self): self._test_kernel(lambda T: T.empty(8, 8) @ T.empty(8, 8), max_steps=100000)
|
||||
@unittest.skip("Rust emulator crashes on this kernel (assertion failure in thread.rs)")
|
||||
def test_gemm_fp16(self): self._test_kernel(lambda T: T.empty(16, 16).half() @ T.empty(16, 16).half(), max_steps=100000)
|
||||
|
||||
# Complex ops
|
||||
def test_softmax(self): self._test_kernel(lambda T: T.empty(16).softmax())
|
||||
def test_layernorm(self): self._test_kernel(lambda T: T.empty(8, 8).layernorm())
|
||||
|
||||
# Memory patterns
|
||||
def test_memory(self): self._test_kernel(lambda T: T.empty(4, 4).permute(1, 0).contiguous() + T.empty(4, 1).expand(4, 4))
|
||||
|
||||
# Cast ops
|
||||
def test_cast(self): self._test_kernel(lambda T: T.empty(32).half().float() + T.empty(32).int().float())
|
||||
|
||||
# Pooling - regression for VCC wave32 mode
|
||||
def test_pool2d(self):
|
||||
self._test_kernel(lambda T: T.empty(1, 1, 8, 8).avg_pool2d(kernel_size=(4,4)) + T.empty(1, 1, 8, 8).max_pool2d(kernel_size=(4,4)))
|
||||
|
||||
# Convolution
|
||||
def test_conv2d(self): self._test_kernel(lambda T: T.empty(1, 2, 8, 8).conv2d(T.empty(2, 2, 3, 3)), max_steps=50000)
|
||||
|
||||
# Regression tests
|
||||
def test_topk(self): self._test_kernel(lambda T: T.empty(64).topk(3)[0])
|
||||
def test_interpolate(self): self._test_kernel(lambda T: T.empty(1,2,16,16).relu().cast('uint8').interpolate((8,8), mode="linear"))
|
||||
def test_index_int64(self):
|
||||
from tinygrad import dtypes
|
||||
self._test_kernel(lambda T: T.empty(4, 4)[T.arange(4).cast(dtypes.int64), :])
|
||||
def test_gelu(self): self._test_kernel(lambda T: T.empty(32, 32).gelu())
|
||||
def test_exp(self): self._test_kernel(lambda T: T.empty(1024).exp())
|
||||
def test_cross_entropy(self):
|
||||
import numpy as np
|
||||
np.random.seed(0)
|
||||
classes = np.random.randint(0, 10, (16,), dtype=np.int32).tolist()
|
||||
x_np = np.random.randn(16, 10).astype(np.float32)
|
||||
self._test_kernel(lambda T: (T(x_np.tolist()).reshape(16,10) + 0).cross_entropy((T(classes).int().reshape(16) + 0)))
|
||||
def test_isinf(self): self._test_kernel(lambda T: T([float('-inf'), 0., float('inf'), 1.1]*8).isinf())
|
||||
def test_sin_f64(self):
|
||||
from tinygrad import dtypes
|
||||
self._test_kernel(lambda T: T([2.0], dtype=dtypes.float64).sin())
|
||||
|
||||
def test_sin_large_f32(self):
|
||||
"""Test sin with large values that trigger Payne-Hanek range reduction."""
|
||||
# Values around 859240 trigger the Payne-Hanek algorithm
|
||||
# This tests the integer multiply-high instructions used in range reduction
|
||||
self._test_kernel(lambda T: T([859240.0, 1000000.0, 100594688.0]).sin())
|
||||
|
||||
def test_clip_zero_one(self):
|
||||
"""Test clip(0, 1) - regression for binary_crossentropy failure."""
|
||||
import numpy as np
|
||||
np.random.seed(0)
|
||||
x_np = np.random.uniform(-2, 2, (32, 10)).astype(np.float32).tolist()
|
||||
self._test_kernel(lambda T: T(x_np).clip(0, 1))
|
||||
|
||||
def test_mod_int64(self):
|
||||
"""Test int64 modulo, especially edge cases like 1 % -1."""
|
||||
from tinygrad import dtypes
|
||||
self._test_kernel(lambda T: T([1, 10, -10, 7], dtype=dtypes.int64) % T([-1, 3, 3, -3], dtype=dtypes.int64))
|
||||
|
||||
def test_expand_flatten_sum(self):
|
||||
"""Test flatten of expanded tensor followed by sum.
|
||||
|
||||
Bug: flatten() of an expanded tensor produces wrong results for certain sizes.
|
||||
Sizes that are multiples of 32 work (32, 48, 64), but sizes like 33, 49, 50 fail.
|
||||
This breaks masked_select and nonzero operations.
|
||||
"""
|
||||
import numpy as np
|
||||
np.random.seed(0)
|
||||
x_np = np.random.uniform(-2, 2, (33,)).astype(np.float32)
|
||||
self._test_kernel(lambda T: (T(x_np.tolist()) > 0.5).unsqueeze(-1).expand(33, 3).flatten().sum())
|
||||
|
||||
@unittest.skip("slow and broken with AMD:LLVM")
|
||||
def test_nonzero(self):
|
||||
"""Test nonzero operation - counts and gathers indices of non-zero elements."""
|
||||
import numpy as np
|
||||
np.random.seed(42)
|
||||
x_np = np.random.rand(10, 5, 3).astype(np.float32)
|
||||
self._test_kernel(lambda T: (T(x_np.tolist()) > 0.5).nonzero())
|
||||
|
||||
@unittest.skip("Precision differences in v_exp/v_log accumulate across kernels, causing memory divergence")
|
||||
def test_softmax_argmax_fused(self):
|
||||
"""Test fused softmax+argmax - tracks exp2 precision issue.
|
||||
|
||||
The fused kernel recomputes softmax inline and Python emulator's exp2 polynomial
|
||||
has up to 1 ULP error vs native exp2f, causing accumulated differences.
|
||||
"""
|
||||
import torch
|
||||
torch.manual_seed(0)
|
||||
x_np = torch.rand(4, 10).numpy()
|
||||
self._test_kernel(lambda T: T(x_np.tolist()).softmax(1).argmax())
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
@ -39,7 +39,6 @@ dev.synchronize()
|
|||
env = os.environ.copy()
|
||||
env["AMD"] = "1"
|
||||
env["MOCKGPU"] = "1"
|
||||
env["PYTHON_REMU"] = "1"
|
||||
env["HCQDEV_WAIT_TIMEOUT_MS"] = "10000"
|
||||
|
||||
st = time.perf_counter()
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Roundtrip tests: generate tinygrad kernels, decode instructions, re-encode, verify match."""
|
||||
import unittest, io, sys, re
|
||||
from dataclasses import dataclass
|
||||
from tinygrad import Device
|
||||
from tinygrad.renderer.amd import detect_format
|
||||
from test.amd.helpers import llvm_assemble, llvm_disasm, get_target, get_mattr
|
||||
|
|
@ -44,6 +45,64 @@ def compile_and_disasm_batch(instrs: list[str], arch: str = 'rdna3') -> list[str
|
|||
code = b''.join(llvm_assemble(instrs, mcpu, mattr))
|
||||
return llvm_disasm(code, mcpu, mattr)[:len(instrs)]
|
||||
|
||||
@dataclass
|
||||
class KernelSnapshot:
|
||||
code: bytes
|
||||
src: str
|
||||
global_size: tuple[int, int, int]
|
||||
local_size: tuple[int, int, int]
|
||||
buf_idxs: list[int] # indices into shared buffer pool
|
||||
buf_sizes: list[int] # sizes for each buffer index
|
||||
|
||||
def get_kernels_from_tinygrad(op_fn) -> tuple[list[KernelSnapshot], dict[int, int], dict[int, bytes]]:
|
||||
"""Compile a tinygrad operation and extract all kernels with their buffer mappings."""
|
||||
from tinygrad import Tensor
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
|
||||
out = op_fn(Tensor)
|
||||
sched = out.schedule()
|
||||
kernels = []
|
||||
buf_pool: dict[int, int] = {} # buffer id -> size
|
||||
buf_data: dict[int, bytes] = {} # buffer id -> initial data from COPY
|
||||
|
||||
for ei in sched:
|
||||
lowered = ei.lower()
|
||||
if ei.ast.op.name == 'COPY':
|
||||
# Handle COPY: extract source data to initialize destination buffer
|
||||
if len(lowered.bufs) >= 2:
|
||||
dst_buf, src_buf = lowered.bufs[0], lowered.bufs[1]
|
||||
dst_id = id(dst_buf)
|
||||
if dst_id not in buf_pool:
|
||||
buf_pool[dst_id] = dst_buf.nbytes
|
||||
# Get source data if it's from numpy/CPU
|
||||
if hasattr(src_buf, 'base') and src_buf.base is not None and hasattr(src_buf.base, '_buf'):
|
||||
src_data = bytes(src_buf.base._buf)
|
||||
buf_data[dst_id] = src_data
|
||||
elif ei.ast.op.name == 'SINK':
|
||||
if lowered.prg and lowered.prg.p.lib:
|
||||
lib = bytes(lowered.prg.p.lib)
|
||||
_, sections, _ = elf_loader(lib)
|
||||
for sec in sections:
|
||||
if sec.name == '.text':
|
||||
buf_idxs = []
|
||||
buf_sizes = []
|
||||
for b in lowered.bufs:
|
||||
buf_id = id(b)
|
||||
if buf_id not in buf_pool:
|
||||
buf_pool[buf_id] = b.nbytes
|
||||
buf_idxs.append(buf_id)
|
||||
buf_sizes.append(b.nbytes)
|
||||
kernels.append(KernelSnapshot(
|
||||
code=bytes(sec.content),
|
||||
src=lowered.prg.p.src,
|
||||
global_size=tuple(lowered.prg.p.global_size),
|
||||
local_size=tuple(lowered.prg.p.local_size),
|
||||
buf_idxs=buf_idxs,
|
||||
buf_sizes=buf_sizes
|
||||
))
|
||||
if not kernels: raise RuntimeError("No kernel found")
|
||||
return kernels, buf_pool, buf_data
|
||||
|
||||
@unittest.skipUnless(Device.DEFAULT == "AMD", "requires AMD device")
|
||||
class TestTinygradKernelRoundtrip(unittest.TestCase):
|
||||
"""Test roundtrip on real tinygrad-generated kernels using get_kernels_from_tinygrad pattern."""
|
||||
|
|
@ -57,7 +116,6 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
|
|||
"""
|
||||
arch = self.arch
|
||||
|
||||
from test.amd.test_compare_emulators import get_kernels_from_tinygrad
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
|
||||
from tinygrad.helpers import DEV
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ Test with `pytest -n12 test/amd/`
|
|||
`DEV=AMD:LLVM pytest -n12 test/amd/`
|
||||
|
||||
* dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
|
||||
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1 PYTHON_REMU=1`
|
||||
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1`
|
||||
* generate.py -- extract assembly format + instruction pseudocode from AMD XML + PDF
|
||||
* test/mockgpu/amd/pcode.py -- pseudocode to UOp transformation
|
||||
* sqtt.py -- SQTT parser
|
||||
|
|
@ -20,20 +20,19 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same
|
|||
|
||||
tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.
|
||||
|
||||
`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
|
||||
The ops tests also pass, but they are very slow, so you should run them one at a time.
|
||||
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
|
||||
|
||||
When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
|
||||
While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware
|
||||
If a test is failing with `DEV=AMD PYTHON_REMU=1 MOCKGPU=1` it's because an instruction is emulated incorrectly.
|
||||
If a test is failing with `DEV=AMD MOCKGPU=1` it's because an instruction is emulated incorrectly.
|
||||
You can test without `MOCKGPU=1` to test on real hardware, if it works on real hardware there's a bug in the emulator.
|
||||
IMPORTANT: if a test is failing in the emulator, it's an instruction bug. Use DEBUG=7, get the instructions, and debug.
|
||||
|
||||
Currently, only RDNA3 is well supported, but when finished, this will support RDNA3+RDNA4+CDNA in ~3000 lines.
|
||||
Get line count with `cloc --by-file tinygrad/renderer/amd/*.py`
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import ctypes, time
|
||||
from dataclasses import replace
|
||||
from test.mockgpu.gpu import VirtGPU
|
||||
from test.mockgpu.helpers import _try_dlopen_remu
|
||||
from test.mockgpu.helpers import PythonRemu
|
||||
from tinygrad.helpers import getbits, to_mv, getenv, DEV
|
||||
from tinygrad.runtime.support import c
|
||||
|
||||
|
|
@ -41,7 +41,7 @@ WAIT_REG_MEM_FUNCTION_EQ = 3 # ==
|
|||
WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
|
||||
WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=
|
||||
|
||||
remu = _try_dlopen_remu()
|
||||
remu = PythonRemu()
|
||||
|
||||
def create_sdma_packets():
|
||||
# TODO: clean up this, if we want to keep it
|
||||
|
|
@ -212,13 +212,13 @@ class PM4Executor(AMDQueue):
|
|||
scratch_size = wavesize * (16 if self.gpu.arch == "cdna" else 4) # per-thread scratch size in bytes
|
||||
|
||||
assert prg_sz > 0, "Invalid prg ptr (not found in mapped ranges)"
|
||||
# Pass valid memory ranges, rsrc2, scratch_size, arch, and user data registers to Python emulator
|
||||
if hasattr(remu, 'valid_mem_ranges'): remu.valid_mem_ranges = self.gpu.mapped_ranges
|
||||
if hasattr(remu, 'rsrc2'): remu.rsrc2 = rsrc2
|
||||
if hasattr(remu, 'scratch_size'): remu.scratch_size = scratch_size
|
||||
if hasattr(remu, 'arch'): remu.arch = self.gpu.arch
|
||||
if hasattr(remu, 'user_data'): remu.user_data = user_data
|
||||
err = remu.run_asm(prg_addr, prg_sz, *gl, *lc, args_addr)
|
||||
# Pass valid memory ranges, rsrc2, scratch_size, arch, and user data registers to the emulator
|
||||
remu.valid_mem_ranges = self.gpu.mapped_ranges
|
||||
remu.rsrc2 = rsrc2
|
||||
remu.scratch_size = scratch_size
|
||||
remu.arch = self.gpu.arch
|
||||
remu.user_data = user_data
|
||||
err = remu.run_asm(prg_addr, prg_sz, gl[0], gl[1], gl[2], lc[0], lc[1], lc[2], args_addr)
|
||||
if err != 0: raise RuntimeError("remu does not support the new instruction introduced in this kernel")
|
||||
|
||||
def _exec_indirect_buffer(self, n):
|
||||
|
|
|
|||
|
|
@ -233,7 +233,6 @@ VOPD_TO_VOP2 = {
|
|||
ir4.VOPDOp.V_DUAL_DOT2ACC_F32_F16: ir3.VOP2Op.V_DOT2ACC_F32_F16_E32,
|
||||
}
|
||||
def _wave_size(arch: str) -> int: return 64 if arch.startswith("cdna") else 32
|
||||
WAVE_SIZE = 32 # default wave size for RDNA (exported for test_compare_emulators)
|
||||
# Special registers stored after inline constants (256-259)
|
||||
PC_LO_IDX, PC_HI_IDX, SCRATCH_STRIDE_IDX = 256, 257, 259
|
||||
# SGPR buffer: 0-127 = SGPRs, 128-255 = inline constants, 256-259 = special registers
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
import ctypes, ctypes.util
|
||||
from tinygrad.helpers import getenv
|
||||
|
||||
def _try_dlopen_gpuocelot():
|
||||
GPUOCELOT_PATHS = [ctypes.util.find_library("gpuocelot")] if ctypes.util.find_library("gpuocelot") is not None else []
|
||||
|
|
@ -16,7 +15,7 @@ def _try_dlopen_gpuocelot():
|
|||
return None
|
||||
|
||||
class PythonRemu:
|
||||
"""Python RDNA3/RDNA4 emulator wrapper that matches the libremu.so interface."""
|
||||
"""Python RDNA3/RDNA4 emulator wrapper used by mockgpu."""
|
||||
valid_mem_ranges: set[tuple[int, int]] = set()
|
||||
rsrc2: int = 0x19c # Default: USER_SGPR_COUNT=14, enable X and Y workgroup IDs
|
||||
scratch_size: int = 0 # private_segment_fixed_size from kernel descriptor
|
||||
|
|
@ -26,20 +25,3 @@ class PythonRemu:
|
|||
def run_asm(self, lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int:
|
||||
from test.mockgpu.amd.emu import run_asm
|
||||
return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr, self.rsrc2, self.scratch_size, self.arch, self.user_data)
|
||||
|
||||
def _try_dlopen_remu():
|
||||
# Use Python emulator only if PYTHON_REMU=1
|
||||
if int(getenv("PYTHON_REMU", "1")):
|
||||
return PythonRemu()
|
||||
REMU_PATHS = ["extra/remu/target/release/libremu.so", "libremu.so", "/usr/local/lib/libremu.so",
|
||||
"extra/remu/target/release/libremu.dylib", "libremu.dylib", "/usr/local/lib/libremu.dylib", "/opt/homebrew/lib/libremu.dylib"]
|
||||
for path in REMU_PATHS:
|
||||
try:
|
||||
remu = ctypes.CDLL(path)
|
||||
remu.run_asm.restype = ctypes.c_int32
|
||||
remu.run_asm.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32,
|
||||
ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_void_p]
|
||||
except OSError: pass
|
||||
else: return remu
|
||||
print("Could not find libremu.so")
|
||||
return None
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue