remove REMU from tree (#15706)

* no more compare emulators * remove remu from tree
2026-06-24 02:14:17 +00:00 · 2026-04-13 20:43:08 +08:00 · 2026-04-13 20:43:08 +08:00 · 16f50a40a5
commit 16f50a40a5
parent ac027055ef
22 changed files with 77 additions and 5732 deletions
--- a/.github/actions/setup-tinygrad/action.yml
+++ b/.github/actions/setup-tinygrad/action.yml
@ -225,14 +225,12 @@ runs:
      if: inputs.amd == 'true' && runner.os == 'Linux'
      shell: bash
      run: |
-        cargo build --release --manifest-path ./extra/remu/Cargo.toml
-        sudo ln -sf ${{ github.workspace }}/extra/remu/target/release/libremu.so /usr/local/lib/libremu.so
        sudo tee --append /etc/ld.so.conf.d/rocm.conf <<'EOF'
          /opt/rocm/lib
          /opt/rocm/lib64
        EOF
        sudo ldconfig
-    - name: Setup AMD comgr+remu (macOS)
+    - name: Setup AMD comgr (macOS)
      if: inputs.amd == 'true' && runner.os == 'macOS'
      shell: bash
      run: |
@ -240,7 +238,6 @@ runs:
        curl -s -H "Authorization: token $GH_TOKEN" curl -s https://api.github.com/repos/tinygrad/amdcomgr_dylib/releases/latest | \
          jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \
          sudo xargs curl -fL -o /usr/local/lib/libamd_comgr.dylib
-        cargo build --release --manifest-path ./extra/remu/Cargo.toml

    # **** gpuocelot ****

--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -71,10 +71,6 @@ jobs:
        uv venv /tmp/tinygrad_pytest_ci
        source /tmp/tinygrad_pytest_ci/bin/activate
        uv pip install .[testing]
-    - name: setup other stuff
-      run: |
-        mkdir -p extra/remu/target/release/
-        ln -s ~/tinygrad/extra/remu/target/release/libremu.so extra/remu/target/release/libremu.so
    - name: setup staging db
      run: |
        echo "CACHEDB=/tmp/pytest-db-ci.db" >> $GITHUB_ENV
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -644,7 +644,6 @@ jobs:
    timeout-minutes: 20
    env:
      DEV: AMD
-      PYTHON_REMU: 1
      MOCKGPU: 1
    steps:
      - name: Checkout Code
--- a/extra/remu/Cargo.lock
+++ b/extra/remu/Cargo.lock
@ -1,66 +0,0 @@
-# This file is automatically @generated by Cargo.
-# It is not intended for manual editing.
-version = 4
-
-[[package]]
-name = "autocfg"
-version = "1.1.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
-
-[[package]]
-name = "cfg-if"
-version = "1.0.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
-
-[[package]]
-name = "crunchy"
-version = "0.2.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
-
-[[package]]
-name = "float-cmp"
-version = "0.9.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "98de4bbd547a563b716d8dfa9aad1cb19bfab00f4fa09a6a4ed21dbcf44ce9c4"
-dependencies = [
- "num-traits",
-]
-
-[[package]]
-name = "half"
-version = "2.3.1"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bc52e53916c08643f1b56ec082790d1e86a32e58dc5268f897f313fbae7b4872"
-dependencies = [
- "cfg-if",
- "crunchy",
- "num-traits",
-]
-
-[[package]]
-name = "libm"
-version = "0.2.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
-
-[[package]]
-name = "num-traits"
-version = "0.2.17"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "39e3200413f237f41ab11ad6d161bc7239c84dcb631773ccd7de3dfe4b5c267c"
-dependencies = [
- "autocfg",
- "libm",
-]
-
-[[package]]
-name = "remu"
-version = "0.1.0"
-dependencies = [
- "float-cmp",
- "half",
- "num-traits",
-]
--- a/extra/remu/Cargo.toml
+++ b/extra/remu/Cargo.toml
@ -1,15 +0,0 @@
-[package]
-name = "remu"
-version = "0.1.0"
-edition = "2021"
-rust-version = "1.80.0"
-
-[lib]
-crate-type = ["cdylib"]
-
-[dependencies]
-half = { version = "2.3.1", features = ["num-traits"] }
-num-traits = "0.2.17"
-
-[dev-dependencies]
-float-cmp = "0.9.0"
--- a/extra/remu/README.md
+++ b/extra/remu/README.md
@ -1,80 +0,0 @@
-## Intro
-
-Remu is an RDNA3 emulator built to test correctness of RDNA3 code. It is used in [tinygrad's AMD CI](https://github.com/tinygrad/tinygrad).
-
-Most of the common instructions are implemented, but some formats like IMG are not supported.
-
-Remu is only for testing correctness of program output, it is not a cycle accurate simulator.
-
-## Build Locally
-
-Remu is written in Rust. Make sure you have [Cargo](https://doc.rust-lang.org/cargo/getting-started/installation.html).
-
-To build the project, run:
-
-```bash
-cargo build --release --manifest-path ./extra/remu/Cargo.toml
-```
-
-This will produce a binary in the `extra/remu/target/release` directory.
-
-## Usage with tinygrad
-
-The latest binaries are released in https://github.com/Qazalin/remu/releases. Alternatively, you can [build locally](#build-locally).
-
-Tinygrad does not yet output RDNA3 kernels directly. You can either install comgr or use `DEV=AMD:LLVM` (default) if you have [LLVM@19](https://github.com/tinygrad/tinygrad/blob/e2ed673c946c8f1774d816c75e52a994c2dd8a88/.github/actions/setup-tinygrad/action.yml#L208).
-
-`PYTHONPATH="." MOCKGPU=1 DEV=AMD python test/test_tiny.py TestTiny.test_plus` runs an emulated RDNA3 kernel with Remu.
-
-Add `DEBUG=6` to see Remu's logs.
-
-### DEBUG output
-
-Remu runs each thread one at a time in a nested for loop, see lib.rs. The DEBUG output prints information about the current thread.
-
-The DEBUG output has 3 sections:
-
-```
-<------------ 1 ----------> <--- 2 ---> <--------------------------------------- 3 ------------------------------------------>
-[0   0   0  ] [0   0   0  ] 0  F4080100 SMEM { op: 2, sdata: 4, sbase: 0, offset: 0, soffset: 124, glc: false, dlc: false }
-```
-
-#### Section 1: Grid info
-
-`[gid.x, gid.y, gid.z], [lid.x, lid.y, lid.z]` of the current thread.
-
-#### Section 2: Wave info
-
-`<lane> <instruction hex>`
-
-RDNA3 divides threads into chunks of 32. Each thread is assigned to a "lane" from 0-31.
-
-In Remu, even though all threads run one at a time, each 32 thread chunk (a wave) shares state like SGPR, VGPR, LDS, EXEC mask, etc.
-Remu can simulate up to one wave sync instruction.
-For more details, see work_group.rs.
-
-Section 2 can have a green or gray color.
-
-Green = The thread is actively executing the instruction.
-
-Gray = The thread has been "turned off" by the EXEC mask, it skips execution of some instructions. (refer to "EXECute Mask" on [page 23](https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna3-shader-instruction-set-architecture-feb-2023_0.pdf#page=23) of ISA docs for more details.)
-
-To see the colors in action, try running `DEBUG=6 PYTHONPATH="." MOCKGPU=1 DEV=AMD python test/test_ops.py TestOps.test_arange_big`. See how only lane 0 writes to global memory:
-```
-[255 0   0  ] [0   0   0  ] 0  DC6A0000 FLAT { op: 26, offset: 0, dlc: false, glc: false, slc: false, seg: 2, addr: 8, data: 0, saddr: 0, sve: false, vdst: 0 }
-[255 0   0  ] [1   0   0  ] 1  DC6A0000
-[255 0   0  ] [2   0   0  ] 2  DC6A0000
-[255 0   0  ] [3   0   0  ] 3  DC6A0000
-[255 0   0  ] [3   0   0  ] 4  DC6A0000
-```
-
-#### Section 3: Decoded Instruction
-
-This prints the instruction type and all the parsed bitfields.
-
-Remu output vs llvm-objdump:
-
-```
-s_load_b64 s[0:1], s[0:1], 0x10                            // 00000000160C: F4040000 F8000010
-SMEM { op: 1, sdata: 0, sbase: 0, offset: 16, soffset: 124, glc: false, dlc: false }
-```
--- a/extra/remu/rustfmt.toml
+++ b/extra/remu/rustfmt.toml
@ -1 +0,0 @@
-max_width = 150
--- a/extra/remu/src/helpers.rs
+++ b/extra/remu/src/helpers.rs
@ -1,162 +0,0 @@
-use half::f16;
-use num_traits::{float::FloatCore, PrimInt, Unsigned, clamp};
-
-pub fn bits<T>(word: T, hi: usize, lo: usize) -> T where T: PrimInt + Unsigned {
-    assert!(hi >= lo);
-    let width = hi - lo + 1;
-    (word >> lo) & ((T::one() << width) - T::one())
-}
-
-pub fn nth(val: u32, pos: usize) -> u32 {
-    (val >> (31 - pos as u32)) & 1
-}
-pub fn f16_lo(val: u32) -> f16 {
-    f16::from_bits((val & 0xffff) as u16)
-}
-pub fn f16_hi(val: u32) -> f16 {
-    f16::from_bits(((val >> 16) & 0xffff) as u16)
-}
-
-pub fn sign_ext(num: u64, bits: usize) -> i64 {
-    let mut value = num;
-    let is_negative = (value >> (bits - 1)) & 1 != 0;
-    if is_negative {
-        value |= !0 << bits;
-    }
-    value as i64
-}
-
-pub trait IEEEClass<T> {
-    fn exponent(&self) -> T;
-}
-impl IEEEClass<u32> for f32 {
-    fn exponent(&self) -> u32 {
-        (self.to_bits() & 0b01111111100000000000000000000000) >> 23
-    }
-}
-impl IEEEClass<u16> for f16 {
-    fn exponent(&self) -> u16 {
-        (self.to_bits() & 0b0111110000000000) >> 10
-    }
-}
-impl IEEEClass<u64> for f64 {
-    fn exponent(&self) -> u64 {
-        (self.to_bits() & 0b0111111111110000000000000000000000000000000000000000000000000000) >> 52
-    }
-}
-
-pub trait VOPModifier<T> {
-    fn negate(&self, pos: usize, modifier: usize) -> T;
-    fn absolute(&self, pos: usize, modifier: usize) -> T;
-    fn clmp(&self, cm: bool) -> T;
-}
-impl<T> VOPModifier<T> for T
-where
-    T: FloatCore,
-{
-    fn negate(&self, pos: usize, modifier: usize) -> T {
-        match (modifier >> pos) & 1 {
-            1 => -*self,
-            _ => *self,
-        }
-    }
-    fn absolute(&self, pos: usize, modifier: usize) -> T {
-        match (modifier >> pos) & 1 {
-            1 => self.abs(),
-            _ => *self,
-        }
-    }
-    fn clmp(&self, cm:bool) -> T {
-        if !cm { return *self }
-        let r = clamp(*self, T::zero(), T::one());
-        if r == T::zero() { T::zero() } else { r }
-    }
-}
-
-pub fn extract_mantissa(x: f64) -> f64 {
-    if x.is_infinite() || x.is_nan() {
-        return x;
-    }
-    let bits = x.to_bits();
-    let mantissa_mask: u64 = 0x000FFFFFFFFFFFFF;
-    let bias: u64 = 1023;
-    let normalized_mantissa_bits = (bits & mantissa_mask) | ((bias - 1) << 52);
-    return f64::from_bits(normalized_mantissa_bits);
-}
-pub fn ldexp(x: f64, exp: i32) -> f64 {
-    x * 2f64.powi(exp)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-    #[test]
-    fn test_extract_mantissa() {
-        assert_eq!(extract_mantissa(2.0f64), 0.5);
-    }
-
-    #[test]
-    fn test_normal_exponent() {
-        assert_eq!(2.5f32.exponent(), 128);
-        assert_eq!(1.17549435e-38f32.exponent(), 1);
-        assert_eq!(f32::INFINITY.exponent(), 255);
-        assert_eq!(f32::NEG_INFINITY.exponent(), 255);
-    }
-
-    #[test]
-    fn test_denormal_exponent() {
-        assert_eq!(1.0e-40f32.exponent(), 0);
-        assert_eq!(1.0e-42f32.exponent(), 0);
-        assert_eq!(1.0e-44f32.exponent(), 0);
-        assert_eq!((1.17549435e-38f32 / 2.0).exponent(), 0);
-    }
-
-    #[test]
-    fn test_normal_exponent_f16() {
-        assert_eq!(f16::from_f32(3.14f32).exponent(), 16);
-        assert_eq!(f16::NEG_INFINITY.exponent(), 31);
-        assert_eq!(f16::INFINITY.exponent(), 31);
-    }
-
-    #[test]
-    fn test_neg() {
-        assert_eq!(0.3_f32.negate(0, 0b001), -0.3_f32);
-        assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32);
-        assert_eq!(0.3_f32.negate(2, 0b100), -0.3_f32);
-        assert_eq!(0.3_f32.negate(0, 0b110), 0.3_f32);
-        assert_eq!(0.3_f32.negate(1, 0b010), -0.3_f32);
-        assert_eq!(0.0_f32.negate(0, 0b001).to_bits(), (-0.0f32).to_bits());
-        assert_eq!((-0.0_f32).negate(0, 0b001).to_bits(), 0);
-    }
-
-    #[test]
-    fn test_sign_ext() {
-        assert_eq!(sign_ext(0b000000000000000101000, 21), 40);
-        assert_eq!(sign_ext(0b111111111111111011000, 21), -40);
-        assert_eq!(sign_ext(0b000000000000000000000, 21), 0);
-        assert_eq!(sign_ext(0b111111111111111111111, 21), -1);
-        assert_eq!(sign_ext(0b111000000000000000000, 21), -262144);
-        assert_eq!(sign_ext(0b000111111111111111111, 21), 262143);
-        assert_eq!(sign_ext(7608, 13), -584);
-    }
-}
-
-use std::sync::LazyLock;
-pub static DEBUG: LazyLock<bool> = LazyLock::new(|| std::env::var("DEBUG").map(|v| v.parse::<usize>().unwrap_or(0) >= 6).unwrap_or(false));
-
-pub fn colored(st:&str, color:&str) -> String {
-    let ansi_code = match color {
-        "green" => format!("\x1b[{};2;39;176;139m", 38),
-        "gray" => format!("\x1b[{};2;169;169;169m", 38),
-        _ => format!("\x1b[{};2;255;255;255m", 38),
-    };
-    format!("{}{}{}", ansi_code, st, "\x1b[0m")
-}
-
-#[macro_export]
-macro_rules! todo_instr {
-    ($x:expr) => {{
-        println!("{:08X}", $x);
-        Err(1)
-    }};
-}
--- a/extra/remu/src/lib.rs
+++ b/extra/remu/src/lib.rs
@ -1,77 +0,0 @@
-use crate::state::StateSnapshot;
-use crate::work_group::{WaveContext, WorkGroup};
-use std::os::raw::c_char;
-use std::slice;
-mod helpers;
-mod rdna3;
-mod state;
-mod thread;
-mod work_group;
-
-#[no_mangle]
-pub extern "C" fn run_asm(lib: *const c_char, lib_sz: u32, gx: u32, gy: u32, gz: u32, lx: u32, ly: u32, lz: u32, args_ptr: *const u64) -> i32 {
-    if lib.is_null() || (lib_sz % 4) != 0 {
-        panic!("Pointer is null or length is not properly aligned to 4 bytes");
-    }
-    let kernel = unsafe { slice::from_raw_parts(lib as *const u32, (lib_sz / 4) as usize).to_vec() };
-    let dispatch_dim = match (gy != 1, gz != 1) {
-        (true, true) => 3,
-        (true, false) => 2,
-        _ => 1,
-    };
-    for gx in 0..gx {
-        for gy in 0..gy {
-            for gz in 0..gz {
-                let mut wg = WorkGroup::new(dispatch_dim, [gx, gy, gz], [lx, ly, lz], &kernel, args_ptr);
-                if let Err(err) = wg.exec_waves() {
-                    return err;
-                }
-            }
-        }
-    }
-    0
-}
-
-// FFI functions for single-stepping comparison tests
-
-#[no_mangle]
-pub extern "C" fn wave_create(lib: *const c_char, lib_sz: u32, n_lanes: u32) -> *mut WaveContext {
-    if lib.is_null() || (lib_sz % 4) != 0 { return std::ptr::null_mut(); }
-    let kernel = unsafe { slice::from_raw_parts(lib as *const u32, (lib_sz / 4) as usize).to_vec() };
-    Box::into_raw(Box::new(WaveContext::new(kernel, n_lanes as usize)))
-}
-
-#[no_mangle]
-pub extern "C" fn wave_step(ctx: *mut WaveContext) -> i32 {
-    if ctx.is_null() { return -99; }
-    unsafe { (*ctx).step() }
-}
-
-#[no_mangle]
-pub extern "C" fn wave_get_snapshot(ctx: *const WaveContext, out: *mut StateSnapshot) {
-    if ctx.is_null() || out.is_null() { return; }
-    unsafe { *out = (*ctx).get_snapshot(); }
-}
-
-#[no_mangle]
-pub extern "C" fn wave_set_sgpr(ctx: *mut WaveContext, idx: u32, val: u32) {
-    if ctx.is_null() || idx >= 128 { return; }
-    unsafe { (*ctx).scalar_reg[idx as usize] = val; }
-}
-
-#[no_mangle]
-pub extern "C" fn wave_set_vgpr(ctx: *mut WaveContext, lane: u32, idx: u32, val: u32) {
-    if ctx.is_null() || lane >= 32 || idx >= 256 { return; }
-    unsafe { (*ctx).vec_reg.get_lane_mut(lane as usize)[idx as usize] = val; }
-}
-
-#[no_mangle]
-pub extern "C" fn wave_init_lds(ctx: *mut WaveContext, size: u32) {
-    if ctx.is_null() { return; }
-    unsafe { (*ctx).lds.data.resize(size as usize, 0); }
-}
-
-#[no_mangle]
-pub extern "C" fn wave_free(ctx: *mut WaveContext) {
-    if !ctx.is_null() { unsafe { drop(Box::from_raw(ctx)); } }
-}
--- a/extra/remu/src/rdna3.rs
+++ b/extra/remu/src/rdna3.rs
@ -1,223 +0,0 @@
-use crate::helpers::{bits, sign_ext};
-
-#[derive(Debug, PartialEq)]
-pub enum Instruction {
-    SOP2 { op: u8, ssrc0: u8, ssrc1: u8, sdst: u8 },
-    SOP1 { op: u8, ssrc0: u8, sdst: u8 },
-    SOPK { op: u8, simm16: i16, sdst: u8 },
-    SOPP { op: u8, simm16: i16 },
-    SOPC { op: u8, ssrc0: u8, ssrc1: u8 },
-
-    SMEM { op: u8, sdata: u8, sbase: u8, offset: i32, soffset: u8, glc: bool, dlc: bool },
-
-    VOP1 { op: u8, vdst: u8, src: u16 },
-    VOP2 { op: u8, vdst: u8, vsrc: u8, src: u16 },
-    VOPC { op: u8, vsrc: u8, src: u16 },
-    VOP3 { op: u32, opsel: u8, cm: bool, abs: u8, vdst: u8, neg: u8, omod: u8, src2: u16, src1: u16, src0: u16 },
-    VOP3SD { op: u32, cm: bool, sdst: u8, vdst: u8, neg: u8, omod: u8, src2: u16, src1: u16, src0: u16 },
-    VOP3P { op: u8, vdst: u8, neg_hi: u8, opsel: u8, opsel_hi: u8, opsel_hi2: bool, cm: bool, src2: u16, src1: u16, src0: u16, neg: u8 },
-    VOPD { opx: u8, opy: u8, vdstx: u8, vdsty: u8, vsrcx1: u8, vsrcy1: u8, srcx0: u16, srcy0: u16 },
-
-    DS { op: u8, gds: bool, offset1: u8, offset0: u8, vdst: u8, data1: u8, data0: u8, addr: u8 },
-
-    FLAT { op: u8, offset: u16, dlc: bool, glc: bool, slc: bool, seg: u8, addr: u8, data: u8, saddr: u8, sve: bool, vdst: u8 }
-}
-
-const VOP3SD_OPS: [u32; 7] = [764, 765, 766, 767, 768, 769, 770];
-
-pub fn decode(word:u32, word1:Option<&u32>) -> Instruction {
-    match bits(word, 31, 30) {
-        0b11 => {
-            let word = (*word1.unwrap() as u64) << 32 | (word as u64);
-            match bits(word, 29, 26) {
-                0b1101 => {
-                    let sbase = (bits(word, 5, 0) as u8) << 1;
-                    let sdata = bits(word, 12, 6) as u8;
-                    let dlc = bits(word, 13, 13) != 0;
-                    let glc = bits(word, 14, 14) != 0;
-                    let op = bits(word, 25, 18) as u8;
-                    let offset = sign_ext(bits(word, 52, 32), 21) as i32;
-                    let soffset = bits(word, 63, 57) as u8;
-                    Instruction::SMEM { sbase, sdata, dlc, glc, op, offset, soffset }
-                }
-                0b0101 => {
-                    let op = bits(word, 25, 16) as u32;
-                    let vdst = bits(word, 7, 0) as u8;
-                    let cm = bits(word, 15, 15) != 0;
-                    let src0 = bits(word, 40, 32) as u16;
-                    let src1 = bits(word, 49, 41) as u16;
-                    let src2 = bits(word, 58, 50) as u16;
-                    let omod = bits(word, 60, 59) as u8;
-                    let neg = bits(word, 63, 61) as u8;
-                     if VOP3SD_OPS.contains(&op) {
-                         let sdst = bits(word, 14, 8) as u8;
-                         Instruction::VOP3SD { op, vdst, sdst, cm, src0, src1, src2, omod, neg }
-                     } else {
-                        let abs = bits(word, 10, 8) as u8;
-                        let opsel = bits(word, 14, 11) as u8;
-                        Instruction::VOP3 { opsel, cm, abs, vdst, neg, omod, src2, src1, src0, op }
-                     }
-                }
-                0b0011 => {
-                    let op = bits(word, 22, 16) as u8;
-                    let vdst = bits(word, 7, 0) as u8;
-                    let neg_hi = bits(word, 10, 8) as u8;
-                    let opsel = bits(word, 13, 11) as u8;
-                    let opsel_hi2 = bits(word, 14, 14) != 0;
-                    let cm = bits(word, 15, 15) != 0;
-                    let src0 = bits(word, 40, 32) as u16;
-                    let src1 = bits(word, 49, 41) as u16;
-                    let src2 = bits(word, 58, 50) as u16;
-                    let opsel_hi = bits(word, 60, 59) as u8;
-                    let neg = bits(word, 63, 61) as u8;
-                    Instruction::VOP3P { op, vdst, neg_hi, opsel, opsel_hi, opsel_hi2, cm, src0, src1, src2, neg }
-                }
-                0b0110 => {
-                    let offset0 = bits(word, 7, 0) as u8;
-                    let offset1 = bits(word, 15, 8) as u8;
-                    let gds = bits(word, 17, 17) != 0;
-                    let op = bits(word, 25, 18) as u8;
-                    let addr = bits(word, 39, 32) as u8;
-                    let data0 = bits(word, 47, 40) as u8;
-                    let data1 = bits(word, 55, 48) as u8;
-                    let vdst = bits(word, 63, 56) as u8;
-                    Instruction::DS { op, gds, offset1, offset0, vdst, data1, data0, addr }
-                }
-                0b0111 => {
-                    let offset = bits(word, 12, 0) as u16;
-                    let dlc = bits(word, 13, 13) != 0;
-                    let glc = bits(word, 14, 14) != 0;
-                    let slc = bits(word, 15, 15) != 0;
-                    let seg = bits(word, 17, 16) as u8;
-                    let op = bits(word, 24, 18) as u8;
-                    let addr = bits(word, 39, 32) as u8;
-                    let data = bits(word, 47, 40) as u8;
-                    let saddr = bits(word, 54, 48) as u8;
-                    let sve = bits(word, 55, 55) != 0;
-                    let vdst = bits(word, 63, 56) as u8;
-                    Instruction::FLAT { offset, dlc, glc, slc, seg, op, addr, data, saddr, sve, vdst }
-                },
-                0b0010 => {
-                    let srcx0 = bits(word, 8, 0) as u16;
-                    let vsrcx1 = bits(word, 16, 9) as u8;
-                    let opy = bits(word, 21, 17) as u8;
-                    let opx = bits(word, 25, 22) as u8;
-                    let srcy0 = bits(word, 40, 32) as u16;
-                    let vsrcy1 = bits(word, 48, 41) as u8;
-                    let vdsty = bits(word, 55, 49) as u8;
-                    let vdstx = bits(word, 63, 56) as u8;
-                    Instruction::VOPD { opx, opy, vdstx, vdsty, vsrcx1, vsrcy1, srcx0, srcy0 }
-                }
-                _ => todo!(),
-            }
-        }
-        0b10 => {
-            let ssrc0 = bits(word, 7, 0) as u8;
-            let ssrc1 = bits(word, 15, 8) as u8;
-            let simm16 = word as i16;
-            let sdst = bits(word, 22, 16) as u8;
-            match bits(word, 29, 23) {
-                0b1111101 => Instruction::SOP1 { ssrc0, sdst, op: bits(word, 15, 8) as u8 },
-                0b1111110 => Instruction::SOPC { ssrc0, ssrc1, op: bits(word, 22, 16) as u8 },
-                0b1111111 => Instruction::SOPP { simm16, op: bits(word, 22, 16) as u8 },
-                _ => {
-                    match bits(word, 29, 28) {
-                        0b11 => Instruction::SOPK { simm16, sdst, op: bits(word, 27, 23) as u8 },
-                        _ => Instruction::SOP2 { ssrc0, ssrc1, sdst, op: bits(word, 29, 23) as u8 }
-                    }
-                }
-            }
-        }
-        _ => {
-            let vdst = bits(word, 24, 17) as u8;
-            let src = bits(word, 8, 0) as u16;
-            let vsrc = bits(word, 16, 9) as u8;
-            match bits(word, 30, 25) {
-                0b111110 => Instruction::VOPC { vsrc, src, op: bits(word, 24, 17) as u8 },
-                0b111111 => Instruction::VOP1 { vdst, src, op: vsrc },
-                _ => Instruction::VOP2 { vdst, vsrc, src, op: bits(word, 30, 25) as u8 },
-            }
-        },
-    }
-}
-
-#[cfg(test)]
-mod test_rdna3 {
-    use super::*;
-
-    use std::process::{Stdio, Command};
-    use std::io::{Result, Write};
-
-    const LLVM_ARGS: &[&str; 3] = &["--arch=amdgcn", "--mcpu=gfx1100", "--triple=amdgcn-amd-amdhsa"];
-    const OFFSET_PRG: usize = 16;
-    const NULL: u8 = 124;
-
-    fn llvm_assemble(asm: &str) -> Result<Vec<u8>> {
-        let mut proc = Command::new("llvm-mc").args(LLVM_ARGS).args(["-filetype=obj", "-o", "-"]).stdin(Stdio::piped()).stdout(Stdio::piped()).spawn()?;
-        proc.stdin.as_mut().unwrap().write_all(asm.as_bytes())?;
-        let out = proc.wait_with_output()?;
-        match out.status.success() {
-            true => Ok(out.stdout),
-            false => Err(std::io::Error::new(std::io::ErrorKind::Other, "llvm-mc err")),
-        }
-    }
-
-    fn llvm_disassemble(code: &Vec<u8>) -> Result<String> {
-        let mut proc = Command::new("llvm-objdump").args(LLVM_ARGS).args(["--disassemble", "-"]).stdin(Stdio::piped()).stdout(Stdio::piped()).spawn()?;
-        proc.stdin.as_mut().unwrap().write_all(code)?;
-        let out = proc.wait_with_output()?;
-        match out.status.success() {
-            true => Ok(String::from_utf8(out.stdout).unwrap()),
-            false => Err(std::io::Error::new(std::io::ErrorKind::Other, "llvm-objdump err")),
-        }
-    }
-
-    fn test_decode(asm: &str) -> Instruction {
-        let lib = llvm_assemble(asm).unwrap();
-        println!("{}", llvm_disassemble(&lib).unwrap());
-        let stream: Vec<u32> = lib.chunks_exact(4).map(|chunk| u32::from_le_bytes(chunk.try_into().unwrap())).skip(OFFSET_PRG).collect();
-        decode(stream[0], stream.get(1))
-    }
-
-    #[test]
-    fn test_decode_smem() {
-        assert_eq!(test_decode("s_load_b128 s[4:7], s[0:1], null"), Instruction::SMEM { op: 2, sdata: 4, sbase: 0, offset: 0, soffset: NULL, glc: false, dlc: false });
-        assert_eq!(test_decode("s_load_b32 s10, s[0:1], 0xc"), Instruction::SMEM { op: 0, sdata: 10, sbase: 0, offset: 0xc, soffset: NULL, glc: false, dlc: false });
-        assert_eq!(test_decode("s_load_b32 s0, s[4:5], s6"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: 0, soffset: 6, glc: false, dlc: false });
-        assert_eq!(test_decode("s_load_b32 s0, s[4:5], glc dlc"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: 0, soffset: NULL, glc: true, dlc: true });
-        assert_eq!(test_decode("s_load_b32 s0, s[4:5], glc"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: 0, soffset: NULL, glc: true, dlc: false });
-        assert_eq!(test_decode("s_load_b32 s0, s[4:5], -20"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: -20, soffset: NULL, glc: false, dlc: false });
-        assert_eq!(test_decode("s_load_b32 s0, s[4:5], -1048576"), Instruction::SMEM { op: 0, sdata: 0, sbase: 4, offset: -1048576, soffset: NULL, glc: false, dlc: false });
-    }
-
-    #[test]
-    fn test_decode_salu() {
-        assert_eq!(test_decode("s_add_u32 s1 s2 s3"), Instruction::SOP2 { op: 0, ssrc0: 2, ssrc1: 3, sdst: 1 });
-        assert_eq!(test_decode("s_add_u32 vcc_hi exec_lo vcc_lo"), Instruction::SOP2 { op: 0, ssrc0: 126, ssrc1: 106, sdst: 107 });
-        assert_eq!(test_decode("s_mov_b32 s1 -0.5"), Instruction::SOP1 { op: 0, ssrc0: 241, sdst: 1 });
-        assert_eq!(test_decode("s_cmpk_eq_i32 s0 -30"), Instruction::SOPK { op: 3, sdst: 0, simm16: -30 });
-        assert_eq!(test_decode("s_cmpk_eq_u32 s0 65535"), Instruction::SOPK { op: 9, sdst: 0, simm16: -1 });
-        assert_eq!(test_decode("s_cmp_ge_i32 s1 s2"), Instruction::SOPC { op: 3, ssrc0: 1, ssrc1: 2 });
-    }
-
-    #[test]
-    fn test_decode_valu_e32() {
-        assert_eq!(test_decode("v_mov_b32 v0, v0"), Instruction::VOP1 { op: 1, vdst: 0, src: 256 });
-        assert_eq!(test_decode("v_mov_b32 v0, s0"), Instruction::VOP1 { op: 1, vdst: 0, src: 0 });
-        assert_eq!(test_decode("v_cmp_t_f32 v1, v0"), Instruction::VOPC { op: 31, vsrc: 0, src: 257 });
-    }
-
-    #[test]
-    fn test_decode_valu_e64() {
-        assert_eq!(test_decode("v_log_f32_e64 v2, |v0|"), Instruction::VOP3 { op: 423, vdst: 2, src0: 256, src1: 0, src2: 0, abs: 0b001, neg: 0, opsel: 0, omod: 0, cm: false });
-        assert_eq!(test_decode("v_div_scale_f32 v2, s1, v0, v1, v2"), Instruction::VOP3SD { op: 764, cm: false, vdst: 2, sdst: 1, src0: 256, src1: 257, src2: 258, omod: 0, neg: 0 });
-        assert_eq!(test_decode("v_pk_add_i16 v1, v0, v2"), Instruction::VOP3P { op: 2, vdst: 1, neg_hi: 0, opsel: 0, opsel_hi: 3, opsel_hi2: true, cm: false, src2: 0, src1: 258, src0: 256, neg: 0 });
-    }
-
-    #[test]
-    fn test_decode_ds() {
-        assert_eq!(test_decode("ds_add_u32 v2, v4 offset:16"), Instruction::DS { op: 0, gds: false, offset1: 0, offset0: 0x10, vdst: 0, data1: 0, data0: 4, addr: 2 });
-        assert_eq!(test_decode("ds_store_b32 v0, v1, offset: 0x04 gds"), Instruction::DS { op: 13, gds: true, offset1: 0, offset0: 0x04, vdst: 0, data1: 0, data0: 1, addr: 0 });
-        assert_eq!(test_decode("ds_load_u8 v1, v0 offset:16"), Instruction::DS { op: 58, gds: false, offset1: 0, offset0: 16, vdst: 1, data1: 0, data0: 0, addr: 0 });
-    }
-}
--- a/extra/remu/src/state.rs
+++ b/extra/remu/src/state.rs
@ -1,272 +0,0 @@
-use std::ops::{Index, IndexMut};
-
-pub trait Register {
-    fn read64(&self, idx: usize) -> u64;
-    fn write64(&mut self, idx: usize, addr: u64);
-}
-impl<T> Register for T where T: Index<usize, Output = u32> + IndexMut<usize> {
-    fn read64(&self, idx: usize) -> u64 {
-        let lsb = self[idx] as u64;
-        let msb = self[idx + 1] as u64;
-        (msb << 32) | lsb
-    }
-
-    fn write64(&mut self, idx: usize, value: u64) {
-        self[idx] = (value & 0xffffffff) as u32;
-        self[idx + 1] = ((value & (0xffffffff << 32)) >> 32) as u32;
-    }
-}
-
-#[derive(Debug, Clone)]
-pub struct VGPR {
-    values: [[u32; 256]; 32],
-    pub default_lane: Option<usize>,
-}
-impl Index<usize> for VGPR {
-    type Output = u32;
-    fn index(&self, index: usize) -> &Self::Output {
-        &self.values[self.default_lane.unwrap()][index]
-    }
-}
-impl IndexMut<usize> for VGPR {
-    fn index_mut(&mut self, index: usize) -> &mut Self::Output {
-        &mut self.values[self.default_lane.unwrap()][index]
-    }
-}
-impl VGPR {
-    pub fn new() -> Self {
-        VGPR {
-            values: [[0; 256]; 32],
-            default_lane: None,
-        }
-    }
-    pub fn get_lane(&self, lane: usize) -> [u32; 256] {
-        *self.values.get(lane).unwrap()
-    }
-    pub fn get_lane_mut(&mut self, lane: usize) -> &mut [u32; 256] {
-        self.values.get_mut(lane).unwrap()
-    }
-}
-
-pub trait Value {
-    fn mut_hi16(&mut self, val: u16);
-    fn mut_lo16(&mut self, val: u16);
-}
-impl Value for u32 {
-    fn mut_hi16(&mut self, val: u16) {
-        *self = ((val as u32) << 16) | (*self as u16 as u32);
-    }
-    fn mut_lo16(&mut self, val: u16) {
-        *self = ((((*self & (0xffff << 16)) >> 16) as u32) << 16) | val as u32;
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-pub struct WaveValue {
-    pub value: u32,
-    pub warp_size: usize,
-    pub default_lane: Option<usize>,
-    pub mutations: Option<[bool; 32]>,
-}
-impl WaveValue {
-    pub fn new(value: u32, warp_size: usize) -> Self {
-        Self {
-            value,
-            warp_size,
-            default_lane: None,
-            mutations: None,
-        }
-    }
-    pub fn read(&self) -> bool {
-        (self.value >> self.default_lane.unwrap()) & 1 == 1
-    }
-    pub fn set_lane(&mut self, value: bool) {
-        if self.mutations.is_none() {
-            self.mutations = Some([false; 32])
-        }
-        self.mutations.as_mut().unwrap()[self.default_lane.unwrap()] = value;
-    }
-    pub fn apply_muts(&mut self) {
-        self.value = 0;
-        for lane in 0..self.warp_size {
-            if self.mutations.unwrap()[lane] {
-                self.value |= 1 << lane;
-            }
-        }
-    }
-}
-
-/// C-compatible state snapshot for FFI - used for comparing emulator states
-#[repr(C)]
-#[derive(Clone, Debug)]
-pub struct StateSnapshot {
-    pub pc: u32,
-    pub scc: u32,
-    pub vcc: u32,
-    pub exec_mask: u32,
-    pub sgpr: [u32; 128],
-    pub vgpr: [[u32; 256]; 32],
-}
-
-impl StateSnapshot {
-    pub fn new() -> Self {
-        Self { pc: 0, scc: 0, vcc: 0, exec_mask: 0, sgpr: [0; 128], vgpr: [[0; 256]; 32] }
-    }
-}
-
-#[derive(Clone, Debug)]
-pub struct VecDataStore {
-    pub data: Vec<u8>,
-}
-
-impl VecDataStore {
-    pub fn new() -> Self {
-        Self { data: Vec::new() }
-    }
-    pub fn write(&mut self, addr: usize, val: u32) {
-        if addr + 4 >= self.data.len() {
-            self.data.resize(self.data.len() + addr + 5, 0);
-        }
-        self.data[addr..addr + 4].iter_mut().enumerate().for_each(|(i, x)| {
-            *x = val.to_le_bytes()[i];
-        });
-    }
-    pub fn write64(&mut self, addr: usize, val: u64) {
-        self.write(addr, (val & 0xffffffff) as u32);
-        self.write(addr + 4, ((val & (0xffffffff << 32)) >> 32) as u32);
-    }
-    pub fn read(&self, addr: usize) -> u32 {
-        let mut bytes: [u8; 4] = [0; 4];
-        bytes.copy_from_slice(&self.data[addr + 0..addr + 4]);
-        u32::from_le_bytes(bytes)
-    }
-    pub fn read64(&mut self, addr: usize) -> u64 {
-        let lsb = self.read(addr);
-        let msb = self.read(addr + 4);
-        ((msb as u64) << 32) | lsb as u64
-    }
-}
-
-#[cfg(test)]
-mod test_state {
-    use super::*;
-
-    #[test]
-    fn test_wave_value() {
-        let mut val = WaveValue::new(0b11000000000000011111111111101110, 32);
-        val.default_lane = Some(0);
-        assert!(!val.read());
-        val.default_lane = Some(31);
-        assert!(val.read());
-    }
-
-    #[test]
-    fn test_wave_value_small() {
-        let mut val = WaveValue::new(0, 1);
-        val.default_lane = Some(0);
-        assert!(!val.read());
-        assert_eq!(val.value, 0);
-        val.set_lane(true);
-        val.apply_muts();
-        assert!(val.read());
-        assert_eq!(val.value, 1);
-    }
-
-    #[test]
-    fn test_wave_value_small_alt() {
-        let mut val = WaveValue::new(0, 2);
-        val.default_lane = Some(0);
-        assert!(!val.read());
-        assert_eq!(val.value, 0);
-        val.set_lane(true);
-        val.apply_muts();
-        assert!(val.read());
-        assert_eq!(val.value, 1);
-    }
-
-    #[test]
-    fn test_wave_value_exec() {
-        let warp_size = 32;
-        let val = WaveValue::new(u32::MAX, warp_size);
-        assert_eq!(val.value, u32::MAX);
-        let warp_size = 3;
-        let val = WaveValue::new((1 << warp_size) - 1, warp_size);
-        assert_eq!(val.value, 7)
-    }
-
-    #[test]
-    fn test_wave_value_toggle_one() {
-        let warp_size = 2;
-        let mut val = WaveValue::new(0b11, warp_size);
-        // 0
-        val.default_lane = Some(0);
-        val.set_lane(false);
-        // 1
-        val.default_lane = Some(1);
-        val.set_lane(true);
-        val.apply_muts();
-        assert_eq!(val.value, 2);
-    }
-
-    #[test]
-    fn test_wave_value_mutate_small() {
-        let mut val = WaveValue::new(0, 2);
-        val.default_lane = Some(0);
-        assert!(!val.read());
-        assert_eq!(val.value, 0);
-        val.set_lane(true);
-        val.apply_muts();
-        assert!(val.read());
-        assert_eq!(val.value, 1);
-    }
-
-    #[test]
-    fn test_wave_value_mutations() {
-        let mut val = WaveValue::new(0b10001, 32);
-        val.default_lane = Some(0);
-        val.set_lane(false);
-        assert!(val.mutations.unwrap().iter().all(|x| !x));
-        val.default_lane = Some(1);
-        val.set_lane(true);
-        assert_eq!(val.value, 0b10001);
-        assert_eq!(
-            val.mutations,
-            Some([
-                false, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false,
-                false, false, false, false, false, false, false, false, false, false, false, false, false,
-            ])
-        );
-
-        val.apply_muts();
-        assert_eq!(val.value, 0b10);
-    }
-
-    #[test]
-    fn test_write16() {
-        let mut vgpr = VGPR::new();
-        vgpr.default_lane = Some(0);
-        vgpr[0] = 0b11100000000000001111111111111111;
-        vgpr[0].mut_lo16(0b1011101111111110);
-        assert_eq!(vgpr[0], 0b11100000000000001011101111111110);
-    }
-
-    #[test]
-    fn test_write16hi() {
-        let mut vgpr = VGPR::new();
-        vgpr.default_lane = Some(0);
-        vgpr[0] = 0b11100000000000001111111111111111;
-        vgpr[0].mut_hi16(0b1011101111111110);
-        assert_eq!(vgpr[0], 0b10111011111111101111111111111111);
-    }
-
-    #[test]
-    fn test_vgpr() {
-        let mut vgpr = VGPR::new();
-        vgpr.default_lane = Some(0);
-        vgpr[0] = 42;
-        vgpr.default_lane = Some(10);
-        vgpr[0] = 10;
-        assert_eq!(vgpr.get_lane(0)[0], 42);
-        assert_eq!(vgpr.get_lane(10)[0], 10);
-    }
-}
--- a/extra/remu/src/thread.rs
+++ b/extra/remu/src/thread.rs
--- a/extra/remu/src/work_group.rs
+++ b/extra/remu/src/work_group.rs
@ -1,323 +0,0 @@
-use crate::helpers::{colored, DEBUG};
-use crate::state::{Register, StateSnapshot, VecDataStore, WaveValue, VGPR};
-use crate::thread::{Thread, END_PRG, SGPR_COUNT};
-use std::collections::HashMap;
-
-pub const WAVE_SIZE: usize = 32;
-
-pub struct WorkGroup<'a> {
-    dispatch_dim: u32,
-    id: [u32; 3],
-    lds: VecDataStore,
-    kernel: &'a Vec<u32>,
-    kernel_args: *const u64,
-    launch_bounds: [u32; 3],
-    wave_state: HashMap<usize, WaveState>,
-}
-
-#[derive(Debug, Clone)]
-struct WaveState {
-    scalar_reg: [u32; SGPR_COUNT],
-    scc: u32,
-    vcc: WaveValue,
-    exec: WaveValue,
-    vec_reg: VGPR,
-    pc: usize,
-    sds: HashMap<usize, VecDataStore>,
-}
-
-const SYNCS: [u32; 4] = [0xBF89FC07, 0xBC7C0000, 0xBF890007, 0xbFB60003];
-const S_BARRIER: u32 = 0xBFBD0000;
-
-/// Context for single-stepping through a wave - holds all mutable state
-pub struct WaveContext {
-    pub kernel: Vec<u32>,
-    pub scalar_reg: [u32; SGPR_COUNT],
-    pub scc: u32,
-    pub pc: usize,
-    pub vec_reg: VGPR,
-    pub vcc: WaveValue,
-    pub exec: WaveValue,
-    pub lds: VecDataStore,
-    pub sds: HashMap<usize, VecDataStore>,
-    pub n_lanes: usize,
-}
-
-impl WaveContext {
-    pub fn new(kernel: Vec<u32>, n_lanes: usize) -> Self {
-        let active = (!0u32).wrapping_shr(32 - (n_lanes as u32));
-        Self {
-            kernel,
-            scalar_reg: [0; SGPR_COUNT],
-            scc: 0,
-            pc: 0,
-            vec_reg: VGPR::new(),
-            vcc: WaveValue::new(0, n_lanes),
-            exec: WaveValue::new(active, n_lanes),
-            lds: VecDataStore::new(),
-            sds: (0..=31).map(|i| (i, VecDataStore::new())).collect(),
-            n_lanes,
-        }
-    }
-
-    /// Execute a single instruction. Returns: 0=continue, -1=endpgm, -2=barrier, 1=done (pc past program), negative=error
-    pub fn step(&mut self) -> i32 {
-        if self.pc >= self.kernel.len() { return 1; }
-        if self.kernel[self.pc] == END_PRG { return -1; }
-        if self.kernel[self.pc] == S_BARRIER { self.pc += 1; return -2; }
-        // Skip sync/nop instructions
-        if SYNCS.contains(&self.kernel[self.pc]) || self.kernel[self.pc] >> 20 == 0xbf8 || self.kernel[self.pc] == 0x7E000000 {
-            self.pc += 1;
-            return 0;
-        }
-
-        let mut sgpr_co = None;
-        for lane_id in 0..self.n_lanes {
-            self.vec_reg.default_lane = Some(lane_id);
-            self.vcc.default_lane = Some(lane_id);
-            self.exec.default_lane = Some(lane_id);
-            let mut thread = Thread {
-                scalar_reg: &mut self.scalar_reg,
-                scc: &mut self.scc,
-                vec_reg: &mut self.vec_reg,
-                vcc: &mut self.vcc,
-                exec: &mut self.exec,
-                lds: &mut self.lds,
-                sds: &mut self.sds.get_mut(&lane_id).unwrap(),
-                pc_offset: 0,
-                stream: self.kernel[self.pc..].to_vec(),
-                scalar: false,
-                simm: None,
-                warp_size: self.n_lanes,
-                sgpr_co: &mut sgpr_co,
-            };
-            if let Err(e) = thread.interpret() { return e; }
-            if thread.scalar {
-                self.pc = ((self.pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
-                break;
-            }
-            if lane_id == self.n_lanes - 1 {
-                self.pc = ((self.pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
-            }
-        }
-        if self.vcc.mutations.is_some() { self.vcc.apply_muts(); self.vcc.mutations = None; }
-        if self.exec.mutations.is_some() { self.exec.apply_muts(); self.exec.mutations = None; }
-        if let Some((idx, mut wv)) = sgpr_co.take() { wv.apply_muts(); self.scalar_reg[idx] = wv.value; }
-        0
-    }
-
-    pub fn get_snapshot(&self) -> StateSnapshot {
-        let mut snap = StateSnapshot::new();
-        snap.pc = self.pc as u32;
-        snap.scc = self.scc;
-        snap.vcc = self.vcc.value;
-        snap.exec_mask = self.exec.value;
-        snap.sgpr = self.scalar_reg;
-        for lane in 0..32 { snap.vgpr[lane] = self.vec_reg.get_lane(lane); }
-        snap
-    }
-}
-
-impl<'a> WorkGroup<'a> {
-    pub fn new(dispatch_dim: u32, id: [u32; 3], launch_bounds: [u32; 3], kernel: &'a Vec<u32>, kernel_args: *const u64) -> Self {
-        Self { dispatch_dim, id, kernel, launch_bounds, kernel_args, lds: VecDataStore::new(), wave_state: HashMap::new() }
-    }
-
-    pub fn exec_waves(&mut self) -> Result<(), i32> {
-        let mut threads = vec![];
-        for z in 0..self.launch_bounds[2] {
-            for y in 0..self.launch_bounds[1] {
-                for x in 0..self.launch_bounds[0] {
-                    threads.push([x, y, z])
-                }
-            }
-        }
-        let waves = threads.chunks(WAVE_SIZE).collect::<Vec<_>>();
-
-        let mut sync = false;
-        for (i, x) in self.kernel.iter().enumerate() {
-            if i != 0 && *x == S_BARRIER {
-                sync = true;
-                break;
-            }
-        }
-
-        for _ in 0..=(sync as usize) {
-            for w in waves.iter().enumerate() {
-                self.exec_wave(w)?
-            }
-        }
-        Ok(())
-    }
-
-    fn exec_wave(&mut self, (wave_id, threads): (usize, &&[[u32; 3]])) -> Result<(), i32> {
-        let (mut scalar_reg, mut scc, mut pc, mut vec_reg, mut vcc, mut exec, mut sds) = match self.wave_state.get(&wave_id) {
-          None => {
-            let mut scalar_reg = [0; SGPR_COUNT];
-            scalar_reg.write64(0, self.kernel_args as u64);
-
-            let [gx, gy, gz] = self.id;
-            match self.dispatch_dim {
-              3 => (scalar_reg[13], scalar_reg[14], scalar_reg[15]) = (gx, gy, gz),
-              2 => (scalar_reg[14], scalar_reg[15]) = (gx, gy),
-              _ => scalar_reg[15] = gx,
-            }
-
-            let mut vec_reg = VGPR::new();
-            for (t, [x, y, z]) in threads.iter().enumerate() {
-              vec_reg.get_lane_mut(t)[0] = match &self.launch_bounds {
-                [_, 1, 1] => *x,
-                _ => (z << 20) | (y << 10) | x,
-              }
-            }
-
-            let vcc = WaveValue::new(0, threads.len());
-            let active = (!0u32).wrapping_shr(32 - (threads.len() as u32));
-            let exec = WaveValue::new(active, threads.len());
-
-            let sds = (0..=31).map(|i| (i, VecDataStore::new())).collect();
-            (scalar_reg, 0, 0, vec_reg, vcc, exec, sds)
-          }
-
-          Some(val) => {
-            let val = val.clone();
-            (val.scalar_reg, val.scc, val.pc, val.vec_reg, val.vcc, val.exec, val.sds)
-          }
-        };
-
-        loop {
-            if self.kernel[pc] == END_PRG {
-                break Ok(());
-            }
-            if self.kernel[pc] == S_BARRIER && self.wave_state.get(&wave_id).is_none() {
-                self.wave_state.insert(wave_id, WaveState { scalar_reg, scc, vec_reg, vcc, exec, pc, sds });
-                break Ok(());
-            }
-            if self.kernel[pc] == S_BARRIER || SYNCS.contains(&self.kernel[pc]) || self.kernel[pc] >> 20 == 0xbf8 || self.kernel[pc] == 0x7E000000 {
-                pc += 1;
-                continue;
-            }
-
-            let mut sgpr_co = None;
-            for (lane_id, [x, y, z]) in threads.iter().enumerate() {
-                vec_reg.default_lane = Some(lane_id);
-                vcc.default_lane = Some(lane_id);
-                exec.default_lane = Some(lane_id);
-                if *DEBUG {
-                    let lane = format!("{:<2} {:08X} ", lane_id, self.kernel[pc]);
-                    let state = match exec.read() {
-                        true => "green",
-                        false => "gray",
-                    };
-                    let [id0, id1, id2] = self.id;
-                    print!("[{id0:<3} {id1:<3} {id2:<3}] [{x:<3} {y:<3} {z:<3}] {}", colored(&lane, state));
-                }
-                let mut thread = Thread {
-                    scalar_reg: &mut scalar_reg,
-                    scc: &mut scc,
-                    vec_reg: &mut vec_reg,
-                    vcc: &mut vcc,
-                    exec: &mut exec,
-                    lds: &mut self.lds,
-                    sds: &mut sds.get_mut(&lane_id).unwrap(),
-                    pc_offset: 0,
-                    stream: self.kernel[pc..self.kernel.len()].to_vec(),
-                    scalar: false,
-                    simm: None,
-                    warp_size: threads.len(),
-                    sgpr_co: &mut sgpr_co,
-                };
-                thread.interpret()?;
-                if *DEBUG {
-                    println!();
-                }
-                if thread.scalar {
-                    pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
-                    break;
-                }
-                if lane_id == threads.len() - 1 {
-                    pc = ((pc as isize) + 1 + (thread.pc_offset as isize)) as usize;
-                }
-            }
-
-            if vcc.mutations.is_some() {
-                vcc.apply_muts();
-                vcc.mutations = None;
-            }
-            if exec.mutations.is_some() {
-                exec.apply_muts();
-                exec.mutations = None;
-            }
-            if let Some((idx, mut wv)) = sgpr_co.take() {
-                wv.apply_muts();
-                scalar_reg[idx] = wv.value;
-            }
-        }
-    }
-}
-
-#[cfg(test)]
-mod test_workgroup {
-    use super::*;
-
-    // TODO: make this generic by adding the assembler
-    fn global_store_sgpr(addr: u64, instructions: Vec<u32>, src: u32) -> Vec<u32> {
-        [
-            instructions,
-            vec![
-                0x7E020200 + src,
-                0x7E0402FF,
-                addr as u32,
-                0x7E0602FF,
-                (addr >> 32) as u32,
-                0xDC6A0000,
-                0x007C0102,
-            ],
-            vec![END_PRG],
-        ]
-        .concat()
-    }
-
-    #[test]
-    fn test_wave_value_state_vcc() {
-        let mut ret: u32 = 0;
-        let kernel = vec![
-            0xBEEA00FF,
-            0b11111111111111111111111111111111, // initial vcc state
-            0x7E140282,
-            0x7C94010A, // cmp blockDim.x == 2
-        ];
-        let addr = (&mut ret as *mut u32) as u64;
-        let kernel = global_store_sgpr(addr, kernel, 106);
-        let mut wg = WorkGroup::new(1, [0, 0, 0], [3, 1, 1], &kernel, [addr].as_ptr());
-        wg.exec_waves().unwrap();
-        assert_eq!(ret, 0b100);
-    }
-
-    #[test]
-    fn test_wave_value_state_exec() {
-        let mut ret: u32 = 0;
-        let kernel = vec![
-            0xBEFE00FF,
-            0b11111111111111111111111111111111,
-            0x7E140282,
-            0x7D9C010A, // cmpx blockDim.x <= 2
-        ];
-        let addr = (&mut ret as *mut u32) as u64;
-        let kernel = global_store_sgpr(addr, kernel, 126);
-        let mut wg = WorkGroup::new(1, [0, 0, 0], [4, 1, 1], &kernel, [addr].as_ptr());
-        wg.exec_waves().unwrap();
-        assert_eq!(ret, 0b0111);
-    }
-
-    #[test]
-    fn test_wave_value_sgpr_co() {
-        let mut ret: u32 = 0;
-        let kernel = vec![0xBE8D00FF, 0x7FFFFFFF, 0x7E1402FF, u32::MAX, 0xD700000A, 0x0002010A];
-        let addr = (&mut ret as *mut u32) as u64;
-        let kernel = global_store_sgpr(addr, kernel, 0);
-        let mut wg = WorkGroup::new(1, [0, 0, 0], [5, 1, 1], &kernel, [addr].as_ptr());
-        wg.exec_waves().unwrap();
-        assert_eq!(ret, 0b11110);
-    }
-}
--- a/extra/remu/test/hwtest.py
+++ b/extra/remu/test/hwtest.py
@ -1,155 +0,0 @@
-# ruff: noqa: F405, F403
-# allow define from star imports
-
-import numpy as np
-import unittest
-import subprocess, struct, math, functools
-from tinygrad import Tensor, dtypes, Device
-from tinygrad.helpers import getenv
-
-from tinygrad.runtime.autogen.amd.rdna3.ins import *
-from tinygrad.renderer.amd.asm import waitcnt
-
-from test.testextra.test_cfg_viz import asm_kernel
-
-def get_output(asm:list, n_threads:int=1, vdst:VGPR=v[1]):
-  out = Tensor([0]*n_threads, dtype=dtypes.uint32).realize()
-  insts = [
-    s_load_b64(s[0:1], s[0:1], NULL),
-    *asm,
-    v_lshlrev_b32_e32(v[0], 2, v[0]),
-    s_waitcnt(simm16=waitcnt(lgkmcnt=0)),
-    #global_store_b32(v[0], v[1], s[0:1]),
-    global_store_b32(addr=v[0], data=vdst, saddr=s[0:1]),
-    s_endpgm()
-  ]
-  out = Tensor.custom_kernel(out, fxn=functools.partial(asm_kernel, name="test", insts=insts, device=out.device, n_threads=n_threads))[0]
-  out.realize()
-  return out.tolist()
-
-def f16_to_bits(x:float) -> int: return struct.unpack('<H', struct.pack('<e', x))[0]
-def f32_from_bits(x:int) -> float: return struct.unpack('<f', struct.pack('<I', x))[0]
-def f32_to_bits(x:float) -> int: return struct.unpack('<I', struct.pack('<f', x))[0]
-
-@unittest.skipUnless(Device.DEFAULT == "AMD", "tests RDNA3")
-class TestHW(unittest.TestCase):
-  def setUp(self):
-    if getenv("MOCKGPU"): subprocess.run(["cargo", "build", "--release", "--manifest-path", "./extra/remu/Cargo.toml"], check=True)
-
-  def test_simple_v_mov(self):
-    out = get_output([
-      v_mov_b32_e32(v[1], 2),
-    ])
-    self.assertEqual(out, [2])
-
-  def test_simple_s_mov(self):
-    out = get_output([
-      s_mov_b32(s[7], 0x7fffffff),
-      v_mov_b32_e32(v[1], s[7]),
-    ])
-    self.assertEqual(out, [0x7fffffff])
-
-  def test_exec_mov(self):
-    out = get_output([
-      v_mov_b32_e32(v[1], 42),
-      s_mov_b32(EXEC_LO, 0b10),
-      v_mov_b32_e32(v[1], 10),
-      s_mov_b32(EXEC_LO, 0b11),
-    ], n_threads=2)
-    np.testing.assert_equal(out, [42, 10])
-
-  def test_exec_cmp_vopc(self):
-    out = get_output([
-      s_mov_b32(VCC_LO, 0), # reset vcc
-      v_mov_b32_e32(v[1], 42),
-      v_mov_b32_e32(v[2], 10),
-      s_mov_b32(EXEC_LO, 0b01),
-      v_cmp_ne_u32_e32(v[1], v[2]),
-      s_mov_b32(EXEC_LO, 0b11),
-      v_mov_b32_e32(v[1], VCC_LO),
-    ], n_threads=2)[0]
-    np.testing.assert_equal(out, 1)
-
-  def test_exec_cmpx_vop3(self):
-    out = get_output([
-      s_mov_b32(EXEC_LO, 0b11),
-      v_mov_b32_e32(v[1], 42),
-      v_mov_b32_e32(v[2], 10),
-      s_mov_b32(EXEC_LO, 0b01),
-      v_cmpx_ne_u32_e32(v[1], v[2]),
-      s_mov_b32(s[10], EXEC_LO),
-      s_mov_b32(EXEC_LO, 0b11),
-      v_mov_b32_e32(v[1], s[10]),
-    ], n_threads=2)[0]
-    np.testing.assert_equal(out & 0b11, 0b01)
-
-  def test_fmac_vop3_modifier(self):
-    init_state = [
-      v_mov_b32_e32(a:=v[1], f16_to_bits(4.0)),
-      v_mov_b32_e32(b:=v[2], f16_to_bits(3.0)),
-      v_mov_b32_e32(c:=v[3], f16_to_bits(2.0)),
-    ]
-    def run_fmac(a, b): return get_output(init_state+[v_fmac_f16_e64(c, a, b)], vdst=c)[0]
-    self.assertEqual(run_fmac(a, b), f16_to_bits(14.0))
-    self.assertEqual(run_fmac(a, -b), f16_to_bits(-10.0))
-    self.assertEqual(run_fmac(-a, -b), f16_to_bits(14.0))
-
-  def test_s_abs_i32(self):
-    def check(x, y, dst=s[10], scc=0):
-      for reg,val in [(dst, y), (SCC, scc)]:
-        self.assertEqual(get_output([
-          s_mov_b32(dst, x),
-          s_abs_i32(dst, dst),
-          v_mov_b32_e32(v[1], reg)
-        ])[0], val)
-
-    check(0x00000001, 0x00000001, scc=1)
-    check(0x7fffffff, 0x7fffffff, scc=1)
-    check(0x80000000, 0x80000000, scc=1)
-    check(0x80000001, 0x7fffffff, scc=1)
-    check(0x80000002, 0x7ffffffe, scc=1)
-    check(0xffffffff, 0x00000001, scc=1)
-    check(0, 0, scc=0)
-
-  def test_v_rcp_f32_neg_vop3(self):
-    def v_neg_rcp_f32(x:float, y:float):
-      out = get_output([
-        v_mov_b32_e32(v[2], f32_to_bits(x)),
-        v_rcp_f32_e64(v[2], -v[2]),
-      ], vdst=v[2])[0]
-      assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
-
-    v_neg_rcp_f32(math.inf, -0.0)
-    v_neg_rcp_f32(-math.inf, 0.0)
-    v_neg_rcp_f32(0.0, -math.inf)
-    v_neg_rcp_f32(-0.0, math.inf)
-    v_neg_rcp_f32(-2.0, 0.5)
-    v_neg_rcp_f32(2.0, -0.5)
-
-  def test_v_cndmask_b32_neg(self):
-    def v_neg(x:float, y:float):
-      out = get_output([
-        v_mov_b32_e32(v[1], f32_to_bits(x)),
-        s_mov_b32(s[10], 1),
-        v_cndmask_b32_e64(v[1], v[1], -v[1], s[10]),
-      ])[0]
-      assert out == f32_to_bits(y), f"{f32_from_bits(out)} != {y} / {out} != {f32_to_bits(y)}"
-
-    v_neg(-0.0, 0.0)
-    v_neg(0.0, -0.0)
-    v_neg(2.0, -2.0)
-    v_neg(math.inf, -math.inf)
-    v_neg(-math.inf, math.inf)
-
-  @unittest.skip("how does VOPD work in the dsl")
-  def test_v_subrev_wrap(self):
-    out = get_output([
-      #v_dual_mov_b32(v[1], 0xffffffff, v[2], 0x0),
-      #v_dual_mov_b32(vdstx=v[1], srcx=0xffffffff, vdsty=v[2], srcy=0x0),
-      #VOPD(opx=VOPDOp.V_DUAL_MOV_B32, opy=VOPDOp.V_DUAL_MOV_B32, vdstx=v[1], srcx=0xffffffff, vdsty=v[2], srcy=0x0),
-      v_subrev_co_u32(v[2], VCC_LO, v[2], v[1]),
-    ], vdst=v[2])[0]
-    self.assertEqual(out, 0xffff_ffff)
-
-if __name__ == "__main__":
-  unittest.main()
--- a/extra/setup_mock_amd_osx.sh
+++ b/extra/setup_mock_amd_osx.sh
@ -3,7 +3,7 @@ INSTALL_PATH="${1:-/opt/homebrew/lib}"
 if [ ! -d "$INSTALL_PATH" ]; then
    USER=$(whoami)
    echo "No path $INSTALL_PATH. Will create. Might need your password..."
-    echo "You can stop now and provide any location as an argument where you want to save the libs (note, that not default locations should be in LD_LIBRARY_PATH, so tinygrad can find the libs)."
+    echo "You can stop now and provide any location as an argument where you want to save the library (note, that not default locations should be in LD_LIBRARY_PATH, so tinygrad can find it)."
    echo "Press any key or symbol to continue..."
    read -n 1 -s

@ -11,11 +11,6 @@ if [ ! -d "$INSTALL_PATH" ]; then
    sudo chown -R "$USER":staff "$INSTALL_PATH"
 fi

-# Download libremu.dylib
-curl -s https://api.github.com/repos/Qazalin/remu/releases/latest | \
-    jq -r '.assets[] | select(.name == "libremu.dylib").browser_download_url' | \
-    xargs curl -L -o $INSTALL_PATH/libremu.dylib
-
 # Download libamd_comgr.dylib
 curl -s https://api.github.com/repos/tinygrad/amdcomgr_dylib/releases/latest | \
    jq -r '.assets[] | select(.name == "libamd_comgr.dylib").browser_download_url' | \
--- a/test/amd/test_compare_emulators.py
+++ b/test/amd/test_compare_emulators.py
@ -1,528 +0,0 @@
-# Test to compare Python and Rust RDNA3 emulators by running real tinygrad kernels
-import unittest, ctypes
-from dataclasses import dataclass
-from pathlib import Path
-from tinygrad import Device
-
-from test.mockgpu.amd.emu import WaveState, _decode_at, WAVE_SIZE, VCC_LO, EXEC_LO, SCC
-from tinygrad.renderer.amd import decode_inst
-import tinygrad
-REMU_PATH = Path(tinygrad.__file__).parent.parent / "extra/remu/target/release/libremu.so"
-if not REMU_PATH.exists(): REMU_PATH = Path(tinygrad.__file__).parent.parent / "extra/remu/target/release/libremu.dylib"
-
-def set_valid_mem_ranges(ranges): pass  # emu2 doesn't need this
-
-def _is_f32_nan(bits: int) -> bool:
-  """Check if 32-bit value is a NaN (exponent all 1s, mantissa non-zero)."""
-  return (bits & 0x7f800000) == 0x7f800000 and (bits & 0x007fffff) != 0
-
-def _vals_equal(a: int, b: int) -> bool:
-  """Compare two 32-bit values, treating all NaN bit patterns as equal."""
-  if a == b: return True
-  return _is_f32_nan(a) and _is_f32_nan(b)
-
-@dataclass
-class KernelSnapshot:
-  code: bytes
-  src: str
-  global_size: tuple[int, int, int]
-  local_size: tuple[int, int, int]
-  buf_idxs: list[int]  # indices into shared buffer pool
-  buf_sizes: list[int]  # sizes for each buffer index
-
-@dataclass
-class StateSnapshot:
-  pc: int
-  scc: int
-  vcc: int
-  exec_mask: int
-  sgpr: list[int]
-  vgpr: list[list[int]]
-
-  def diff(self, other: 'StateSnapshot', n_lanes: int, arrow: str = " vs ") -> list[str]:
-    """Return list of differences between two states."""
-    diffs = []
-    if self.pc != other.pc: diffs.append(f"pc: {self.pc}{arrow}{other.pc}")
-    if self.scc != other.scc: diffs.append(f"scc: {self.scc}{arrow}{other.scc}")
-    if self.vcc != other.vcc: diffs.append(f"vcc: 0x{self.vcc:08x}{arrow}0x{other.vcc:08x}")
-    if self.exec_mask != other.exec_mask: diffs.append(f"exec: 0x{self.exec_mask:08x}{arrow}0x{other.exec_mask:08x}")
-    for i, (a, b) in enumerate(zip(self.sgpr, other.sgpr)):
-      # Skip VCC_LO/HI (106/107) and EXEC_LO/HI (126/127) as they alias vcc/exec_mask which are compared separately
-      if i in (106, 107, 126, 127): continue
-      if not _vals_equal(a, b): diffs.append(f"sgpr[{i}]: 0x{a:08x}{arrow}0x{b:08x}")
-    for lane in range(n_lanes):
-      for i, (a, b) in enumerate(zip(self.vgpr[lane], other.vgpr[lane])):
-        if not _vals_equal(a, b): diffs.append(f"vgpr[{lane}][{i}]: 0x{a:08x}{arrow}0x{b:08x}")
-    return diffs
-
-class CStateSnapshot(ctypes.Structure):
-  _fields_ = [("pc", ctypes.c_uint32), ("scc", ctypes.c_uint32), ("vcc", ctypes.c_uint32), ("exec_mask", ctypes.c_uint32),
-              ("sgpr", ctypes.c_uint32 * 128), ("vgpr", (ctypes.c_uint32 * 256) * 32)]
-
-  def to_snapshot(self) -> StateSnapshot:
-    return StateSnapshot(pc=self.pc, scc=self.scc, vcc=self.vcc, exec_mask=self.exec_mask,
-                         sgpr=list(self.sgpr), vgpr=[list(self.vgpr[i]) for i in range(32)])
-
-class RustEmulator:
-  def __init__(self):
-    self.lib = ctypes.CDLL(str(REMU_PATH))
-    self.lib.wave_create.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32]
-    self.lib.wave_create.restype = ctypes.c_void_p
-    self.lib.wave_step.argtypes = [ctypes.c_void_p]
-    self.lib.wave_step.restype = ctypes.c_int32
-    self.lib.wave_get_snapshot.argtypes = [ctypes.c_void_p, ctypes.POINTER(CStateSnapshot)]
-    self.lib.wave_set_sgpr.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32]
-    self.lib.wave_set_vgpr.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32]
-    self.lib.wave_init_lds.argtypes = [ctypes.c_void_p, ctypes.c_uint32]
-    self.lib.wave_free.argtypes = [ctypes.c_void_p]
-    self.ctx = None
-
-  def create(self, kernel: bytes, n_lanes: int):
-    kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
-    self.ctx = self.lib.wave_create(ctypes.addressof(kernel_buf), len(kernel), n_lanes)
-    self._kernel_buf = kernel_buf
-
-  def step(self) -> int: return self.lib.wave_step(self.ctx)
-  def set_sgpr(self, idx: int, val: int): self.lib.wave_set_sgpr(self.ctx, idx, val)
-  def set_vgpr(self, lane: int, idx: int, val: int): self.lib.wave_set_vgpr(self.ctx, lane, idx, val)
-  def init_lds(self, size: int): self.lib.wave_init_lds(self.ctx, size)
-
-  def get_snapshot(self) -> StateSnapshot:
-    snap = CStateSnapshot()
-    self.lib.wave_get_snapshot(self.ctx, ctypes.byref(snap))
-    return snap.to_snapshot()
-
-  def free(self):
-    if self.ctx:
-      self.lib.wave_free(self.ctx)
-      self.ctx = None
-
-class PythonEmulator:
-  def __init__(self):
-    self.state: WaveState | None = None
-    self.program: dict[int, tuple] = {}  # lazily populated: pc -> (name, fxn, globals)
-    self.vmem_buf = None
-    self.lds_buf = None
-    self.kernel_buf = None  # Keep kernel bytes alive
-    self.lib_addr = 0  # Base address of kernel code
-
-  def create(self, kernel: bytes, n_lanes: int):
-    import ctypes
-    from tinygrad.device import Buffer, BufferSpec
-    from tinygrad.dtype import dtypes
-    # Store kernel in a ctypes buffer so _decode_at can read from memory at actual PC address
-    self.kernel_buf = (ctypes.c_char * len(kernel)).from_buffer_copy(kernel)
-    self.lib_addr = ctypes.addressof(self.kernel_buf)
-    self.program = {}
-    self.state = WaveState(n_lanes)
-    self.state.pc = self.lib_addr  # Set PC to code base address
-    self.vmem_buf = Buffer('CPU', 1 << 40, dtypes.uint32, options=BufferSpec(external_ptr=0)).ensure_allocated()
-    self.lds_buf = Buffer('CPU', 65536 // 4, dtypes.uint32).ensure_allocated()
-
-  def _ensure_decoded(self, pc: int):
-    if pc not in self.program:
-      runner, _ = _decode_at(pc, "rdna3")
-      self.program[pc] = (runner.p.function_name, runner._prg.fxn, runner.p.globals)
-
-  def step(self) -> int:
-    import ctypes
-    assert self.state is not None
-    pc = self.state.pc
-    if pc == 0xFFFFFFFFFFFFFFFF: return -1
-    self._ensure_decoded(pc)
-    name, fxn, globals_list = self.program[pc]
-    buf_addrs = {0: self.state.sgpr_buf._buf.va_addr, 1: self.state.vgpr_buf._buf.va_addr,  # type: ignore[union-attr]
-                 2: self.vmem_buf._buf.va_addr, 3: self.lds_buf._buf.va_addr}  # type: ignore[union-attr]
-    fxn(*[ctypes.c_uint64(buf_addrs[g]) for g in globals_list], ctypes.c_int32(0))
-    return -1 if self.state.pc == 0xFFFFFFFFFFFFFFFF else 0
-
-  def set_sgpr(self, idx: int, val: int):
-    assert self.state is not None
-    self.state._write_sgpr(idx, val)
-  def set_vgpr(self, lane: int, idx: int, val: int):
-    assert self.state is not None
-    self.state._write_vgpr(idx, lane, val)
-
-  def get_snapshot(self) -> StateSnapshot:
-    assert self.state is not None
-    sgpr = [self.state._read_sgpr(i) for i in range(128)]
-    vgpr = [[self.state._read_vgpr(reg, lane) for reg in range(256)] for lane in range(WAVE_SIZE)]
-    # Convert actual PC address to word offset for comparison with Rust emulator
-    pc_offset = (self.state.pc - self.lib_addr) // 4 if self.state.pc != 0xFFFFFFFFFFFFFFFF else 0xFFFFFFFFFFFFFFFF
-    return StateSnapshot(pc=pc_offset, scc=self.state._read_sgpr(SCC.offset), vcc=sgpr[VCC_LO.offset],
-                         exec_mask=sgpr[EXEC_LO.offset], sgpr=sgpr, vgpr=vgpr)
-
-def run_single_kernel(kernel: bytes, n_lanes: int, args_ptr: int, global_size: tuple[int, int, int],
-                      local_size: tuple[int, int, int], max_steps: int, debug: bool, trace_len: int,
-                      kernel_idx: int = 0, max_workgroups: int = 8) -> tuple[bool, str, int]:
-  """Run a single kernel through both emulators. Returns (success, message, total_steps)."""
-  gx, gy, gz = global_size
-  lx, ly, lz = local_size
-  total_steps = 0
-  wg_count = 0
-
-  for gidz in range(gz):
-    for gidy in range(gy):
-      for gidx in range(gx):
-        if wg_count >= max_workgroups: return True, f"Completed {wg_count} workgroups (limit reached)", total_steps
-        wg_count += 1
-        rust = RustEmulator()
-        python = PythonEmulator()
-        rust.create(kernel, n_lanes)
-        python.create(kernel, n_lanes)
-
-        # Initialize LDS (64KB, standard size for AMD GPUs)
-        rust.init_lds(65536)
-
-        for emu in (rust, python):
-          emu.set_sgpr(0, args_ptr & 0xffffffff)
-          emu.set_sgpr(1, (args_ptr >> 32) & 0xffffffff)
-          emu.set_sgpr(13, gidx)
-          emu.set_sgpr(14, gidy)
-          emu.set_sgpr(15, gidz)
-          # Initialize v[0] with packed workitem IDs for each lane
-          for lane in range(n_lanes):
-            tid = lane
-            z, y, x = tid // (lx * ly), (tid // lx) % ly, tid % lx
-            emu.set_vgpr(lane, 0, (z << 20) | (y << 10) | x)
-
-        step = 0
-        trace: list[tuple[int, int, str, StateSnapshot, StateSnapshot]] = []
-        prev_sync_after = False  # Track if previous instruction had known Rust bugs
-        try:
-          while step < max_steps:
-            rust_before = rust.get_snapshot()
-            python_before = python.get_snapshot()
-
-            pc_addr = python.lib_addr + python_before.pc * 4  # Convert word offset to actual address
-            python._ensure_decoded(pc_addr)
-            inst_hex_name = python.program[pc_addr][0]
-            # Decode the instruction to get mnemonic for sync_after checks
-            try:
-              # Format is mnemonic_hexbytes, e.g. v_exp_f32_e32_014b027e -> hex is 014b027e
-              parts = inst_hex_name.rsplit('_', 1)
-              inst_bytes_hex = parts[1] if len(parts) == 2 else ""
-              inst_bytes = bytes.fromhex(inst_bytes_hex) if inst_bytes_hex else b''
-              decoded = decode_inst(inst_bytes) if inst_bytes else None
-              inst_mnemonic = repr(decoded).split('(')[0] if decoded else ""
-            except Exception:
-              inst_mnemonic = ""
-            # For generic instructions, use function name for sync_after check
-            if not inst_mnemonic: inst_mnemonic = inst_hex_name
-            inst_str = inst_hex_name
-            trace.append((step, python_before.pc, inst_str, rust_before, python_before))
-            if len(trace) > trace_len: trace.pop(0)
-
-            if debug: print(f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: PC={python_before.pc}, inst={inst_str}")
-
-            # Instructions with known Rust emulator bugs or precision differences - sync Python to Rust after execution
-            # v_div_scale/v_div_fixup: Rust has different VCC handling
-            # v_cvt_f16_f32: Rust clears high 16 bits, but hardware (and Python) preserves them
-            # s_add_i32/s_sub_i32: Rust has incorrect SCC overflow detection
-            # v_exp_f32/v_log_f32/v_ldexp_f32: precision differences in transcendental functions
-            # s_delay_alu: Rust handles differently
-            # v_add_co_ci_u32/v_sub_co_ci_u32/v_subrev_co_ci_u32: Rust preserves inactive VCC bits, but hardware clears all bits
-            sync_after = any(x in inst_mnemonic.lower() for x in ('v_div_scale', 'v_div_fixup', 'v_cvt_f16_f32', 's_add_i32', 's_sub_i32',
-                                                                   'v_exp_f32', 'v_log_f32', 'v_ldexp_f32', 's_delay_alu',
-                                                                   'v_add_co_ci_u32', 'v_sub_co_ci_u32', 'v_subrev_co_ci_u32'))
-            # Skip comparison if previous instruction had known Rust bugs (states were synced but may still differ slightly)
-            diffs = rust_before.diff(python_before, n_lanes) if not prev_sync_after else []
-            if diffs:
-              trace_lines = []
-              for idx, (s, pc, d, rb, pb) in enumerate(trace):
-                trace_lines.append(f"    step {s}: PC={pc:3d} {d}")
-                if idx < len(trace) - 1:
-                  next_rb, next_pb = trace[idx + 1][3:5]
-                  rust_diffs = rb.diff(next_rb, n_lanes, "->")
-                  python_diffs = pb.diff(next_pb, n_lanes, "->")
-                  if rust_diffs: trace_lines.append(f"             rust:   {', '.join(rust_diffs[:5])}")
-                  if python_diffs: trace_lines.append(f"             python: {', '.join(python_diffs[:5])}")
-                  elif rust_diffs: trace_lines.append("             python: (no changes)")
-                else:
-                  # Last traced instruction - compare with current state
-                  rust_diffs = rb.diff(rust_before, n_lanes, "->")
-                  python_diffs = pb.diff(python_before, n_lanes, "->")
-                  if rust_diffs: trace_lines.append(f"             rust:   {', '.join(rust_diffs[:5])}")
-                  if python_diffs: trace_lines.append(f"             python: {', '.join(python_diffs[:5])}")
-                  elif rust_diffs: trace_lines.append("             python: (no changes)")
-              trace_str = "\n".join(trace_lines)
-              msg = f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step} before inst '{inst_str}': states differ (rust vs python):\n  "
-              msg += "\n  ".join(diffs[:10]) + f"\n  Recent instructions:\n{trace_str}"
-              return False, msg, total_steps
-
-            rust_result = rust.step()
-            python_result = python.step()
-
-            if rust_result != python_result:
-              # Rust returns 1 for unsupported instructions - skip test
-              if rust_result == 1 and python_result == 0:
-                raise unittest.SkipTest(f"Rust emulator doesn't support instruction: {inst_str}")
-              trace_str = "\n".join(f"    step {s}: PC={pc:3d} {d}" for s, pc, d, _, _ in trace)
-              msg = (f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: different return codes: "
-                     f"rust={rust_result}, python={python_result}, inst={inst_str}\n  Recent instructions:\n{trace_str}")
-              return False, msg, total_steps
-
-            # Sync Python state to Rust after instructions with known Rust emulator differences
-            if sync_after:
-              rust_after = rust.get_snapshot()
-              for i in range(128): python.set_sgpr(i, rust_after.sgpr[i])
-              for lane in range(n_lanes):
-                for i in range(256): python.set_vgpr(lane, i, rust_after.vgpr[lane][i])
-              assert python.state is not None
-              # Convert Rust's word-based PC to Python's actual address
-              python.state.pc = python.lib_addr + rust_after.pc * 4
-              python.state._write_sgpr(SCC.offset, rust_after.scc)
-              python.state._write_sgpr(VCC_LO.offset, rust_after.vcc)
-              python.state._write_sgpr(EXEC_LO.offset, rust_after.exec_mask)
-            prev_sync_after = sync_after
-
-            if rust_result == -1:
-              total_steps += step + 1
-              break
-            if rust_result == 1:
-              total_steps += step + 1
-              break
-            if rust_result < 0 and rust_result != -2:
-              return False, f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Step {step}: error code {rust_result}", total_steps
-
-            step += 1
-          else:
-            return False, f"K{kernel_idx} WG({gidx},{gidy},{gidz}) Max steps ({max_steps}) reached", total_steps
-        finally:
-          rust.free()
-
-  return True, f"Completed {gx*gy*gz} workgroups", total_steps
-
-def compare_emulators_multi_kernel(kernels: list[KernelSnapshot], buf_pool: dict[int, int], max_steps: int = 1000,
-                                    debug: bool = False, trace_len: int = 10, buf_data: dict[int, bytes] | None = None) -> tuple[bool, str]:
-  """Run all kernels through both emulators with shared buffer pool."""
-  if buf_data is None: buf_data = {}
-
-  # Allocate shared buffer pool with padding for over-reads (GPU loads up to 16 bytes at once)
-  buf_id_to_ptr: dict[int, int] = {}
-  buffers = []
-  for buf_id, size in buf_pool.items():
-    padded_size = ((size + 15) // 16) * 16 + 16  # round up to 16 bytes + extra padding
-    # Initialize with data from COPY if available
-    init_data = buf_data.get(buf_id, b'\x00' * padded_size)
-    init_list = list(init_data) + [0] * (padded_size - len(init_data))
-    buf = (ctypes.c_uint8 * padded_size)(*init_list[:padded_size])
-    buffers.append((buf, padded_size))
-    buf_id_to_ptr[buf_id] = ctypes.addressof(buf)
-
-  # Set up valid memory ranges
-  ranges = {(ctypes.addressof(b), size) for b, size in buffers}
-
-  total_steps = 0
-  for ki, kernel in enumerate(kernels):
-    # Create args array for this kernel's buffers
-    args = (ctypes.c_uint64 * len(kernel.buf_idxs))(*[buf_id_to_ptr[bid] for bid in kernel.buf_idxs])
-    args_ptr = ctypes.addressof(args)
-
-    # Update valid ranges to include this args array
-    kernel_ranges = ranges | {(args_ptr, ctypes.sizeof(args))}
-    set_valid_mem_ranges(kernel_ranges)
-
-    n_lanes = kernel.local_size[0] * kernel.local_size[1] * kernel.local_size[2]
-
-    ok, msg, steps = run_single_kernel(
-      kernel.code, min(n_lanes, 32), args_ptr, kernel.global_size,
-      kernel.local_size, max_steps, debug, trace_len, ki
-    )
-    total_steps += steps
-    if not ok:
-      return False, msg
-
-  return True, f"Completed {len(kernels)} kernels, {total_steps} total steps"
-
-def compare_emulators_with_memory(kernel: bytes, n_lanes: int, buf_sizes: list, max_steps: int = 1000, debug: bool = False,
-                                   global_size: tuple[int, int, int] = (1, 1, 1), trace_len: int = 10) -> tuple[bool, str]:
-  """Run both emulators with memory set up for tinygrad kernels, executing all workgroups. Legacy wrapper."""
-  # Allocate buffers
-  buffers = []
-  for size in buf_sizes:
-    buf = (ctypes.c_uint8 * size)(*[0] * size)
-    buffers.append(buf)
-
-  # Create args array with buffer pointers
-  args = (ctypes.c_uint64 * len(buffers))(*[ctypes.addressof(b) for b in buffers])
-  args_ptr = ctypes.addressof(args)
-
-  # Set up valid memory ranges for Python emulator
-  ranges = {(ctypes.addressof(b), len(b)) for b in buffers}
-  ranges.add((args_ptr, ctypes.sizeof(args)))
-  set_valid_mem_ranges(ranges)
-
-  # Legacy wrapper assumes local_size = (n_lanes, 1, 1)
-  ok, msg, _ = run_single_kernel(kernel, n_lanes, args_ptr, global_size, (n_lanes, 1, 1), max_steps, debug, trace_len)
-  return ok, msg
-
-def get_kernels_from_tinygrad(op_fn) -> tuple[list[KernelSnapshot], dict[int, int], dict[int, bytes]]:
-  """Compile a tinygrad operation and extract all kernels with their buffer mappings."""
-  from tinygrad import Tensor
-  from tinygrad.runtime.support.elf import elf_loader
-
-  out = op_fn(Tensor)
-  sched = out.schedule()
-  kernels = []
-  buf_pool: dict[int, int] = {}  # buffer id -> size
-  buf_data: dict[int, bytes] = {}  # buffer id -> initial data from COPY
-
-  for ei in sched:
-    lowered = ei.lower()
-    if ei.ast.op.name == 'COPY':
-      # Handle COPY: extract source data to initialize destination buffer
-      if len(lowered.bufs) >= 2:
-        dst_buf, src_buf = lowered.bufs[0], lowered.bufs[1]
-        dst_id = id(dst_buf)
-        if dst_id not in buf_pool:
-          buf_pool[dst_id] = dst_buf.nbytes
-        # Get source data if it's from numpy/CPU
-        if hasattr(src_buf, 'base') and src_buf.base is not None and hasattr(src_buf.base, '_buf'):
-          src_data = bytes(src_buf.base._buf)
-          buf_data[dst_id] = src_data
-    elif ei.ast.op.name == 'SINK':
-      if lowered.prg and lowered.prg.p.lib:
-        lib = bytes(lowered.prg.p.lib)
-        _, sections, _ = elf_loader(lib)
-        for sec in sections:
-          if sec.name == '.text':
-            buf_idxs = []
-            buf_sizes = []
-            for b in lowered.bufs:
-              buf_id = id(b)
-              if buf_id not in buf_pool:
-                buf_pool[buf_id] = b.nbytes
-              buf_idxs.append(buf_id)
-              buf_sizes.append(b.nbytes)
-            kernels.append(KernelSnapshot(
-              code=bytes(sec.content),
-              src=lowered.prg.p.src,
-              global_size=tuple(lowered.prg.p.global_size),
-              local_size=tuple(lowered.prg.p.local_size),
-              buf_idxs=buf_idxs,
-              buf_sizes=buf_sizes
-            ))
-  if not kernels: raise RuntimeError("No kernel found")
-  return kernels, buf_pool, buf_data
-
-def get_kernel_from_tinygrad(op_fn) -> tuple[bytes, tuple[int, int, int], tuple[int, int, int], list]:
-  """Compile a tinygrad operation and extract the last (main) kernel binary. Legacy wrapper."""
-  kernels, _, _ = get_kernels_from_tinygrad(op_fn)
-  k = kernels[-1]
-  return k.code, k.global_size, k.local_size, k.buf_sizes
-
-@unittest.skipUnless(Device.DEFAULT == "AMD", "requires AMD device")
-class TestTinygradKernels(unittest.TestCase):
-  """Compare emulators on real tinygrad-compiled kernels."""
-
-  def _test_kernel(self, op_fn, max_steps=10000):
-    kernels, buf_pool, buf_data = get_kernels_from_tinygrad(op_fn)
-    ok, msg = compare_emulators_multi_kernel(kernels, buf_pool, max_steps=max_steps, buf_data=buf_data)
-    self.assertTrue(ok, msg)
-
-  # Basic ops - consolidated tests covering key instruction patterns
-  def test_unary_ops(self): self._test_kernel(lambda T: T([-1.0, 0.0, 1.0, 2.0]).relu().exp().log().sqrt().reciprocal())
-  def test_binary_ops(self): self._test_kernel(lambda T: (T([1.0, 2.0]) + T([3.0, 4.0])) * T([0.5, 0.5]) - T([1.0, 1.0]))
-  def test_trig(self): self._test_kernel(lambda T: T([0.1, 1.0, 3.14, -1.0]*8).sin() + T([0.1, 1.0, 3.14, -1.0]*8).cos())
-  def test_compare(self): self._test_kernel(lambda T: (T.empty(64) < T.empty(64)).where(T.empty(64), T.empty(64)))
-  def test_bitwise(self): self._test_kernel(lambda T: (T([0xF0, 0x0F, 0xFF]*11).int() & T([0x0F, 0x0F, 0x00]*11).int()) | T([1]*33).int())
-  def test_int_ops(self): self._test_kernel(lambda T: ((T.empty(64).int() + T.empty(64).int()) * T.empty(64).int()).float())
-
-  # Reductions
-  def test_reduce(self): self._test_kernel(lambda T: T.empty(64).sum() + T.empty(64).max())
-  def test_argmax(self): self._test_kernel(lambda T: T.empty(64).argmax())
-
-  # Matmul
-  def test_gemm(self): self._test_kernel(lambda T: T.empty(8, 8) @ T.empty(8, 8), max_steps=100000)
-  @unittest.skip("Rust emulator crashes on this kernel (assertion failure in thread.rs)")
-  def test_gemm_fp16(self): self._test_kernel(lambda T: T.empty(16, 16).half() @ T.empty(16, 16).half(), max_steps=100000)
-
-  # Complex ops
-  def test_softmax(self): self._test_kernel(lambda T: T.empty(16).softmax())
-  def test_layernorm(self): self._test_kernel(lambda T: T.empty(8, 8).layernorm())
-
-  # Memory patterns
-  def test_memory(self): self._test_kernel(lambda T: T.empty(4, 4).permute(1, 0).contiguous() + T.empty(4, 1).expand(4, 4))
-
-  # Cast ops
-  def test_cast(self): self._test_kernel(lambda T: T.empty(32).half().float() + T.empty(32).int().float())
-
-  # Pooling - regression for VCC wave32 mode
-  def test_pool2d(self):
-    self._test_kernel(lambda T: T.empty(1, 1, 8, 8).avg_pool2d(kernel_size=(4,4)) + T.empty(1, 1, 8, 8).max_pool2d(kernel_size=(4,4)))
-
-  # Convolution
-  def test_conv2d(self): self._test_kernel(lambda T: T.empty(1, 2, 8, 8).conv2d(T.empty(2, 2, 3, 3)), max_steps=50000)
-
-  # Regression tests
-  def test_topk(self): self._test_kernel(lambda T: T.empty(64).topk(3)[0])
-  def test_interpolate(self): self._test_kernel(lambda T: T.empty(1,2,16,16).relu().cast('uint8').interpolate((8,8), mode="linear"))
-  def test_index_int64(self):
-    from tinygrad import dtypes
-    self._test_kernel(lambda T: T.empty(4, 4)[T.arange(4).cast(dtypes.int64), :])
-  def test_gelu(self): self._test_kernel(lambda T: T.empty(32, 32).gelu())
-  def test_exp(self): self._test_kernel(lambda T: T.empty(1024).exp())
-  def test_cross_entropy(self):
-    import numpy as np
-    np.random.seed(0)
-    classes = np.random.randint(0, 10, (16,), dtype=np.int32).tolist()
-    x_np = np.random.randn(16, 10).astype(np.float32)
-    self._test_kernel(lambda T: (T(x_np.tolist()).reshape(16,10) + 0).cross_entropy((T(classes).int().reshape(16) + 0)))
-  def test_isinf(self): self._test_kernel(lambda T: T([float('-inf'), 0., float('inf'), 1.1]*8).isinf())
-  def test_sin_f64(self):
-    from tinygrad import dtypes
-    self._test_kernel(lambda T: T([2.0], dtype=dtypes.float64).sin())
-
-  def test_sin_large_f32(self):
-    """Test sin with large values that trigger Payne-Hanek range reduction."""
-    # Values around 859240 trigger the Payne-Hanek algorithm
-    # This tests the integer multiply-high instructions used in range reduction
-    self._test_kernel(lambda T: T([859240.0, 1000000.0, 100594688.0]).sin())
-
-  def test_clip_zero_one(self):
-    """Test clip(0, 1) - regression for binary_crossentropy failure."""
-    import numpy as np
-    np.random.seed(0)
-    x_np = np.random.uniform(-2, 2, (32, 10)).astype(np.float32).tolist()
-    self._test_kernel(lambda T: T(x_np).clip(0, 1))
-
-  def test_mod_int64(self):
-    """Test int64 modulo, especially edge cases like 1 % -1."""
-    from tinygrad import dtypes
-    self._test_kernel(lambda T: T([1, 10, -10, 7], dtype=dtypes.int64) % T([-1, 3, 3, -3], dtype=dtypes.int64))
-
-  def test_expand_flatten_sum(self):
-    """Test flatten of expanded tensor followed by sum.
-
-    Bug: flatten() of an expanded tensor produces wrong results for certain sizes.
-    Sizes that are multiples of 32 work (32, 48, 64), but sizes like 33, 49, 50 fail.
-    This breaks masked_select and nonzero operations.
-    """
-    import numpy as np
-    np.random.seed(0)
-    x_np = np.random.uniform(-2, 2, (33,)).astype(np.float32)
-    self._test_kernel(lambda T: (T(x_np.tolist()) > 0.5).unsqueeze(-1).expand(33, 3).flatten().sum())
-
-  @unittest.skip("slow and broken with AMD:LLVM")
-  def test_nonzero(self):
-    """Test nonzero operation - counts and gathers indices of non-zero elements."""
-    import numpy as np
-    np.random.seed(42)
-    x_np = np.random.rand(10, 5, 3).astype(np.float32)
-    self._test_kernel(lambda T: (T(x_np.tolist()) > 0.5).nonzero())
-
-  @unittest.skip("Precision differences in v_exp/v_log accumulate across kernels, causing memory divergence")
-  def test_softmax_argmax_fused(self):
-    """Test fused softmax+argmax - tracks exp2 precision issue.
-
-    The fused kernel recomputes softmax inline and Python emulator's exp2 polynomial
-    has up to 1 ULP error vs native exp2f, causing accumulated differences.
-    """
-    import torch
-    torch.manual_seed(0)
-    x_np = torch.rand(4, 10).numpy()
-    self._test_kernel(lambda T: T(x_np.tolist()).softmax(1).argmax())
-
-if __name__ == "__main__":
-  unittest.main()
--- a/test/amd/test_mockgpu_invalid.py
+++ b/test/amd/test_mockgpu_invalid.py
@ -39,7 +39,6 @@ dev.synchronize()
    env = os.environ.copy()
    env["AMD"] = "1"
    env["MOCKGPU"] = "1"
-    env["PYTHON_REMU"] = "1"
    env["HCQDEV_WAIT_TIMEOUT_MS"] = "10000"

    st = time.perf_counter()
--- a/test/amd/test_roundtrip.py
+++ b/test/amd/test_roundtrip.py
@ -1,6 +1,7 @@
 #!/usr/bin/env python3
 """Roundtrip tests: generate tinygrad kernels, decode instructions, re-encode, verify match."""
 import unittest, io, sys, re
+from dataclasses import dataclass
 from tinygrad import Device
 from tinygrad.renderer.amd import detect_format
 from test.amd.helpers import llvm_assemble, llvm_disasm, get_target, get_mattr
@ -44,6 +45,64 @@ def compile_and_disasm_batch(instrs: list[str], arch: str = 'rdna3') -> list[str
  code = b''.join(llvm_assemble(instrs, mcpu, mattr))
  return llvm_disasm(code, mcpu, mattr)[:len(instrs)]

+@dataclass
+class KernelSnapshot:
+  code: bytes
+  src: str
+  global_size: tuple[int, int, int]
+  local_size: tuple[int, int, int]
+  buf_idxs: list[int]  # indices into shared buffer pool
+  buf_sizes: list[int]  # sizes for each buffer index
+
+def get_kernels_from_tinygrad(op_fn) -> tuple[list[KernelSnapshot], dict[int, int], dict[int, bytes]]:
+  """Compile a tinygrad operation and extract all kernels with their buffer mappings."""
+  from tinygrad import Tensor
+  from tinygrad.runtime.support.elf import elf_loader
+
+  out = op_fn(Tensor)
+  sched = out.schedule()
+  kernels = []
+  buf_pool: dict[int, int] = {}  # buffer id -> size
+  buf_data: dict[int, bytes] = {}  # buffer id -> initial data from COPY
+
+  for ei in sched:
+    lowered = ei.lower()
+    if ei.ast.op.name == 'COPY':
+      # Handle COPY: extract source data to initialize destination buffer
+      if len(lowered.bufs) >= 2:
+        dst_buf, src_buf = lowered.bufs[0], lowered.bufs[1]
+        dst_id = id(dst_buf)
+        if dst_id not in buf_pool:
+          buf_pool[dst_id] = dst_buf.nbytes
+        # Get source data if it's from numpy/CPU
+        if hasattr(src_buf, 'base') and src_buf.base is not None and hasattr(src_buf.base, '_buf'):
+          src_data = bytes(src_buf.base._buf)
+          buf_data[dst_id] = src_data
+    elif ei.ast.op.name == 'SINK':
+      if lowered.prg and lowered.prg.p.lib:
+        lib = bytes(lowered.prg.p.lib)
+        _, sections, _ = elf_loader(lib)
+        for sec in sections:
+          if sec.name == '.text':
+            buf_idxs = []
+            buf_sizes = []
+            for b in lowered.bufs:
+              buf_id = id(b)
+              if buf_id not in buf_pool:
+                buf_pool[buf_id] = b.nbytes
+              buf_idxs.append(buf_id)
+              buf_sizes.append(b.nbytes)
+            kernels.append(KernelSnapshot(
+              code=bytes(sec.content),
+              src=lowered.prg.p.src,
+              global_size=tuple(lowered.prg.p.global_size),
+              local_size=tuple(lowered.prg.p.local_size),
+              buf_idxs=buf_idxs,
+              buf_sizes=buf_sizes
+            ))
+  if not kernels: raise RuntimeError("No kernel found")
+  return kernels, buf_pool, buf_data
+
@unittest.skipUnless(Device.DEFAULT == "AMD", "requires AMD device")
 class TestTinygradKernelRoundtrip(unittest.TestCase):
  """Test roundtrip on real tinygrad-generated kernels using get_kernels_from_tinygrad pattern."""
@ -57,7 +116,6 @@ class TestTinygradKernelRoundtrip(unittest.TestCase):
    """
    arch = self.arch

-    from test.amd.test_compare_emulators import get_kernels_from_tinygrad
    from tinygrad.runtime.support.elf import elf_loader
    from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
    from tinygrad.helpers import DEV
--- a/test/mockgpu/amd/README
+++ b/test/mockgpu/amd/README
@ -4,7 +4,7 @@ Test with `pytest -n12 test/amd/`
 `DEV=AMD:LLVM pytest -n12 test/amd/`

 * dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
-* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1 PYTHON_REMU=1`
+* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1`
 * generate.py -- extract assembly format + instruction pseudocode from AMD XML + PDF
 * test/mockgpu/amd/pcode.py -- pseudocode to UOp transformation
 * sqtt.py -- SQTT parser
@ -20,20 +20,19 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same

 tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.

-`DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
-`DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
+`DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
+`DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`

 The ops tests also pass, but they are very slow, so you should run them one at a time.

-`SKIP_SLOW_TEST=1 DEV=AMD PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
-`SKIP_SLOW_TEST=1 DEV=AMD:LLVM PYTHON_REMU=1 MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
+`SKIP_SLOW_TEST=1 DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
+`SKIP_SLOW_TEST=1 DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`

 When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
 While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware
-If a test is failing with `DEV=AMD PYTHON_REMU=1 MOCKGPU=1` it's because an instruction is emulated incorrectly.
+If a test is failing with `DEV=AMD MOCKGPU=1` it's because an instruction is emulated incorrectly.
 You can test without `MOCKGPU=1` to test on real hardware, if it works on real hardware there's a bug in the emulator.
 IMPORTANT: if a test is failing in the emulator, it's an instruction bug. Use DEBUG=7, get the instructions, and debug.

 Currently, only RDNA3 is well supported, but when finished, this will support RDNA3+RDNA4+CDNA in ~3000 lines.
 Get line count with `cloc --by-file tinygrad/renderer/amd/*.py`
-
--- a/test/mockgpu/amd/amdgpu.py
+++ b/test/mockgpu/amd/amdgpu.py
@ -1,7 +1,7 @@
 import ctypes, time
 from dataclasses import replace
 from test.mockgpu.gpu import VirtGPU
-from test.mockgpu.helpers import _try_dlopen_remu
+from test.mockgpu.helpers import PythonRemu
 from tinygrad.helpers import getbits, to_mv, getenv, DEV
 from tinygrad.runtime.support import c

@ -41,7 +41,7 @@ WAIT_REG_MEM_FUNCTION_EQ  = 3 # ==
 WAIT_REG_MEM_FUNCTION_NEQ = 4 # !=
 WAIT_REG_MEM_FUNCTION_GEQ = 5 # >=

-remu = _try_dlopen_remu()
+remu = PythonRemu()

 def create_sdma_packets():
  # TODO: clean up this, if we want to keep it
@ -212,13 +212,13 @@ class PM4Executor(AMDQueue):
    scratch_size = wavesize * (16 if self.gpu.arch == "cdna" else 4)  # per-thread scratch size in bytes

    assert prg_sz > 0, "Invalid prg ptr (not found in mapped ranges)"
-    # Pass valid memory ranges, rsrc2, scratch_size, arch, and user data registers to Python emulator
-    if hasattr(remu, 'valid_mem_ranges'): remu.valid_mem_ranges = self.gpu.mapped_ranges
-    if hasattr(remu, 'rsrc2'): remu.rsrc2 = rsrc2
-    if hasattr(remu, 'scratch_size'): remu.scratch_size = scratch_size
-    if hasattr(remu, 'arch'): remu.arch = self.gpu.arch
-    if hasattr(remu, 'user_data'): remu.user_data = user_data
-    err = remu.run_asm(prg_addr, prg_sz, *gl, *lc, args_addr)
+    # Pass valid memory ranges, rsrc2, scratch_size, arch, and user data registers to the emulator
+    remu.valid_mem_ranges = self.gpu.mapped_ranges
+    remu.rsrc2 = rsrc2
+    remu.scratch_size = scratch_size
+    remu.arch = self.gpu.arch
+    remu.user_data = user_data
+    err = remu.run_asm(prg_addr, prg_sz, gl[0], gl[1], gl[2], lc[0], lc[1], lc[2], args_addr)
    if err != 0: raise RuntimeError("remu does not support the new instruction introduced in this kernel")

  def _exec_indirect_buffer(self, n):
--- a/test/mockgpu/amd/emu.py
+++ b/test/mockgpu/amd/emu.py
@ -233,7 +233,6 @@ VOPD_TO_VOP2 = {
  ir4.VOPDOp.V_DUAL_DOT2ACC_F32_F16: ir3.VOP2Op.V_DOT2ACC_F32_F16_E32,
 }
 def _wave_size(arch: str) -> int: return 64 if arch.startswith("cdna") else 32
-WAVE_SIZE = 32  # default wave size for RDNA (exported for test_compare_emulators)
 # Special registers stored after inline constants (256-259)
 PC_LO_IDX, PC_HI_IDX, SCRATCH_STRIDE_IDX = 256, 257, 259
 # SGPR buffer: 0-127 = SGPRs, 128-255 = inline constants, 256-259 = special registers
--- a/test/mockgpu/helpers.py
+++ b/test/mockgpu/helpers.py
@ -1,5 +1,4 @@
 import ctypes, ctypes.util
-from tinygrad.helpers import getenv

 def _try_dlopen_gpuocelot():
  GPUOCELOT_PATHS = [ctypes.util.find_library("gpuocelot")] if ctypes.util.find_library("gpuocelot") is not None else []
@ -16,7 +15,7 @@ def _try_dlopen_gpuocelot():
  return None

 class PythonRemu:
-  """Python RDNA3/RDNA4 emulator wrapper that matches the libremu.so interface."""
+  """Python RDNA3/RDNA4 emulator wrapper used by mockgpu."""
  valid_mem_ranges: set[tuple[int, int]] = set()
  rsrc2: int = 0x19c  # Default: USER_SGPR_COUNT=14, enable X and Y workgroup IDs
  scratch_size: int = 0  # private_segment_fixed_size from kernel descriptor
@ -26,20 +25,3 @@ class PythonRemu:
  def run_asm(self, lib: int, lib_sz: int, gx: int, gy: int, gz: int, lx: int, ly: int, lz: int, args_ptr: int) -> int:
    from test.mockgpu.amd.emu import run_asm
    return run_asm(lib, lib_sz, gx, gy, gz, lx, ly, lz, args_ptr, self.rsrc2, self.scratch_size, self.arch, self.user_data)
-
-def _try_dlopen_remu():
-  # Use Python emulator only if PYTHON_REMU=1
-  if int(getenv("PYTHON_REMU", "1")):
-    return PythonRemu()
-  REMU_PATHS = ["extra/remu/target/release/libremu.so", "libremu.so", "/usr/local/lib/libremu.so",
-               "extra/remu/target/release/libremu.dylib", "libremu.dylib", "/usr/local/lib/libremu.dylib", "/opt/homebrew/lib/libremu.dylib"]
-  for path in REMU_PATHS:
-    try:
-      remu = ctypes.CDLL(path)
-      remu.run_asm.restype = ctypes.c_int32
-      remu.run_asm.argtypes = [ctypes.c_void_p, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32,
-        ctypes.c_uint32, ctypes.c_uint32, ctypes.c_uint32, ctypes.c_void_p]
-    except OSError: pass
-    else: return remu
-  print("Could not find libremu.so")
-  return None