MOCKGPU interfaces (#15796)

2026-06-24 02:14:17 +00:00 · 2026-04-17 18:56:29 -07:00 · 2026-04-17 18:56:29 -07:00 · 6adf4c3cd9
commit 6adf4c3cd9
parent 8da308573f
29 changed files with 94 additions and 95 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -643,8 +643,7 @@ jobs:
    runs-on: ubuntu-24.04
    timeout-minutes: 20
    env:
-      DEV: AMD
-      MOCKGPU: 1
+      DEV: MOCKKFD+AMD
    steps:
      - name: Checkout Code
        uses: actions/checkout@v6
@ -670,7 +669,7 @@ jobs:
      - name: Run AMD renderer tests
        run: python -m pytest -n=auto test/amd/ --durations 20
      - name: Run AMD renderer tests (AMD:LLVM)
-        run: DEV=AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
+        run: DEV=MOCKKFD+AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
      - name: Run SQTT profiling tests
        run: PROFILE=1 SQTT=1 python3 -m pytest -n=auto test/amd/test_sqtt_profiler.py
      - name: Run AMD emulated tests on NULL backend
@ -679,20 +678,19 @@ jobs:
        run: |
          PYTHONPATH=. DEV=NULL::gfx1100 python extra/mmapeak/mmapeak.py
          PYTHONPATH=. DEV=NULL::gfx1201 python3 -m pytest -n=auto test/testextra/test_tk.py test/backend/test_asm_gemm.py
-      - name: Run matmul on MOCKGPU
+      - name: Run matmul on MOCKKFD
        run: |
-          PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_asm_matmul.py
-          PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_copy_matmul.py
+          PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_asm_matmul.py
+          PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_copy_matmul.py
      - name: Run LLVM test
-        run: DEV=AMD:LLVM python test/device/test_amd_llvm.py
+        run: DEV=MOCKKFD+AMD:LLVM python test/device/test_amd_llvm.py

  testmockam:
    name: Linux (am)
    runs-on: ubuntu-24.04
    timeout-minutes: 15
    env:
-      DEV: PCI+AMD
-      MOCKGPU: 1
+      DEV: MOCKPCI+AMD
    steps:
      - name: Checkout Code
        uses: actions/checkout@v6
@ -704,13 +702,13 @@ jobs:
          amd: 'true'
      - name: Run test_tiny on MOCKAM
        run: python test/test_tiny.py
-      - name: Run test_tiny on MOCKAM USB
-        run: GMMU=0 DEV=USB+AMD python test/test_tiny.py
-      - name: Run test_hcq on MOCKAM
+      - name: Run test_tiny on MOCKUSB
+        run: GMMU=0 DEV=MOCKUSB+AMD python test/test_tiny.py
+      - name: Run test_hcq on MOCKPCI
        run: python -m pytest test/device/test_hcq.py
-      - name: Run disk copy tests on MOCKAM
+      - name: Run disk copy tests on MOCKPCI
        run: python -m pytest test/unit/test_disk_tensor.py -k test_copy_from_disk
-      - name: Run test_tiny on MOCKAM Remote
+      - name: Run test_tiny on MOCKPCI Remote
        run: |
          python extra/remote/serve.py 6667 &
          sleep 2
@ -728,8 +726,7 @@ jobs:
    runs-on: ubuntu-22.04
    timeout-minutes: 15
    env:
-      DEV: AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
-      MOCKGPU: 1
+      DEV: MOCKKFD+AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
      SKIP_SLOW_TEST: 1
    steps:
      - name: Checkout Code
@ -764,7 +761,6 @@ jobs:
    runs-on: ubuntu-22.04
    timeout-minutes: 20
    env:
-      MOCKGPU: 1
      FORWARD_ONLY: 1
    steps:
      - name: Checkout Code
@ -777,7 +773,7 @@ jobs:
          cuda: 'true'
          ocelot: 'true'
      - name: Set env
-        run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA:PTX' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
+        run: printf "${{ matrix.backend == 'ptx' && 'DEV=MOCK+CUDA:PTX' || matrix.backend == 'nv' && 'DEV=MOCKNVK+NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
      - name: Check Device.DEFAULT and print some source
        run: |
          python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
@ -862,22 +858,19 @@ jobs:
      run: DEV=METAL TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
    - name: Run pytest (amd)
      env:
-        MOCKGPU: 1
-        DEV: AMD
+        DEV: MOCKKFD+AMD
        FORWARD_ONLY: 1
      run: |
        python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
    - name: Run pytest (amd with llvm backend)
      env:
-        MOCKGPU: 1
-        DEV: "AMD:LLVM"
+        DEV: "MOCKKFD+AMD:LLVM"
        FORWARD_ONLY: 1
      run: |
        python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
    - name: Run pytest (ptx)
      env:
-        MOCKGPU: 1
-        DEV: "NV:PTX"
+        DEV: "MOCKNVK+NV:PTX"
        FORWARD_ONLY: 1
        # TODO: failing due to library loading error
        CAPTURE_PROCESS_REPLAY: 0
--- a/docs/abstractions4.py
+++ b/docs/abstractions4.py
@ -1,9 +1,9 @@
 # tinygrad allows you to write kernels at many different abstractions levels.
 # This is for RDNA3, but if you don't have one you can run with the emulator
-# PYTHONPATH="." MOCKGPU=1 DEV=AMD
+# PYTHONPATH="." DEV=MOCKPCI+AMD

 from tinygrad import Tensor, Context, GlobalCounters, UOp, Device
-from tinygrad.helpers import DEBUG, getenv
+from tinygrad.helpers import DEV, DEBUG, getenv
 from tinygrad.uop.ops import AxisType, KernelInfo, Ops
 from tinygrad.dtype import AddrSpace, dtypes
 from tinygrad.runtime.autogen.amd.rdna3.ins import *
@ -16,7 +16,7 @@ def eval_harness(name, tensor, fxn, check=None):
  print(f"computed in {GlobalCounters.time_sum_s*1000:.2f} ms, {(a.nbytes()/1e9)/GlobalCounters.time_sum_s:.2f} GB/s")
  return out

-SZ = 256*1024 if getenv("MOCKGPU") else 1024*1024*1024
+SZ = 256*1024 if DEV.interface.startswith("MOCK") else 1024*1024*1024

 def example_2_hip(a:Tensor, correct):
  GLOBALS = 1024
--- a/test/amd/test_mockgpu_invalid.py
+++ b/test/amd/test_mockgpu_invalid.py
@ -37,8 +37,7 @@ dev.synchronize()
 '''

    env = os.environ.copy()
-    env["AMD"] = "1"
-    env["MOCKGPU"] = "1"
+    env["DEV"] = "MOCKKFD+AMD"
    env["HCQDEV_WAIT_TIMEOUT_MS"] = "10000"

    st = time.perf_counter()
--- a/test/amd/test_sqtt_encoder.py
+++ b/test/amd/test_sqtt_encoder.py
@ -1,7 +1,7 @@
 #!/usr/bin/env python3
 """Tests for SQTT encoder: verifies the emulator produces correct SQTT traces for known kernels.

-Run with: DEV=AMD MOCKGPU=1 python -m pytest test/amd/test_sqtt_encoder.py -v
+Run with: DEV=MOCKKFD+AMD python -m pytest test/amd/test_sqtt_encoder.py -v
 """
 import ctypes, unittest
 from tinygrad.helpers import Context
--- a/test/backend/test_asm_gemm.py
+++ b/test/backend/test_asm_gemm.py
@ -1,7 +1,7 @@
 import unittest
 from tinygrad import Tensor, Device, dtypes, Context
 from tinygrad.device import is_dtype_supported
-from tinygrad.helpers import getenv, system
+from tinygrad.helpers import getenv, system, DEV
 from extra.gemm.cdna_asm_gemm import asm_gemm
 from test.helpers import needs_second_gpu
 from examples.mlperf.models.flat_llama import FP8_DTYPE
@ -131,7 +131,7 @@ class TestGemmLlama(unittest.TestCase):
  dtype = dtypes.bfloat16

  def setUp(self):
-    if not is_cdna4() or getenv("MOCKGPU"):
+    if not is_cdna4() or DEV.interface.startswith("MOCK"):
      self.skipTest("very slow on non mi350x")

  def test_empty(self): asm_gemm(Tensor.empty(N:=getenv("N", 4096), N, dtype=self.dtype), Tensor.empty(N, N, dtype=self.dtype)).realize()
--- a/test/backend/test_dtype_alu.py
+++ b/test/backend/test_dtype_alu.py
@ -1,7 +1,7 @@
 import unittest, operator, math
 from tinygrad import Context, Tensor, dtypes, Device
 from tinygrad.dtype import DType, truncate, fp8_to_float
-from tinygrad.helpers import CI, EMULATED_DTYPES, getenv
+from tinygrad.helpers import CI, EMULATED_DTYPES, DEV, getenv
 from tinygrad.tensor import _to_np_dtype
 from tinygrad.device import is_dtype_supported
 from tinygrad.runtime.ops_python import from_storage_scalar
@ -32,7 +32,8 @@ unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), (Tensor.sin, np.
 #binary_operations.append(operator.truediv)

 # TODO: CI CUDA segfaults on sin, WEBGPU and NIR sines are not precise enough for large numbers
-if (getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}) or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer):
+if ((DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"})
+    or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer)):
  unary_operations.remove((Tensor.sin, np.sin))
  unary_operations.remove((Tensor.cos, np.cos))

--- a/test/backend/test_edgecases.py
+++ b/test/backend/test_edgecases.py
@ -27,10 +27,10 @@ import numpy as np
 import torch
 from tinygrad import Tensor, dtypes, nn
 from tinygrad.device import Device
-from tinygrad.helpers import getenv
+from tinygrad.helpers import DEV
 from tinygrad.renderer.nir import NIRRenderer

-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

 class TestNaNEdgeCases(unittest.TestCase):
  # we don't need more of these. it's unclear if torch's behavior is desired here
--- a/test/backend/test_interop.py
+++ b/test/backend/test_interop.py
@ -3,12 +3,12 @@ import unittest
 import torch
 import numpy as np

-from tinygrad.helpers import getenv, CI
+from tinygrad.helpers import CI, DEV
 from tinygrad.tensor import Tensor
 from tinygrad.device import Device
 from tinygrad.dtype import _from_torch_dtype, _to_torch_dtype

-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

@unittest.skipIf(Device.DEFAULT not in ["METAL", "CUDA"] or MOCKGPU, f"no support on {Device.DEFAULT}")
 class TestInterop(unittest.TestCase):
--- a/test/backend/test_jit.py
+++ b/test/backend/test_jit.py
@ -8,7 +8,7 @@ from tinygrad.tensor import Tensor
 from tinygrad.engine.jit import TinyJit, JitError, GraphRunner, MultiGraphRunner, graph_class
 from tinygrad.engine.realize import CompiledRunner, BufferCopy, BufferXfer
 from tinygrad.device import Device
-from tinygrad.helpers import Context, JIT, GlobalCounters, getenv
+from tinygrad.helpers import Context, JIT, DEV, GlobalCounters
 from tinygrad.dtype import dtypes
 from extra.models.unet import ResBlock

@ -812,7 +812,7 @@ class TestJitGraphSplit(unittest.TestCase):
      hcqgraph=[self.ji_graph(6)])

  @unittest.skip("this fails if you don't have SDMA or are using AMD_DISABLE_SDMA=1")
-  @unittest.skipIf(getenv("MOCKGPU"), "MockGPU does not support parallel copies")
+  @unittest.skipIf(DEV.interface.startswith("MOCK"), "MockGPU does not support parallel copies")
  def test_jit_multidev_copy(self):
    if Device.DEFAULT in {"CPU"}: raise unittest.SkipTest("CPU/LLVM is not a valid default device for this test (zero-copies)")

--- a/test/backend/test_linearizer.py
+++ b/test/backend/test_linearizer.py
@ -7,12 +7,12 @@ from tinygrad.uop.ops import UOp, Ops, GroupOp, AxisType
 from tinygrad.device import Device, Buffer, is_dtype_supported
 from tinygrad.tensor import Tensor, _to_np_dtype
 from tinygrad.engine.realize import run_schedule, CompiledRunner, get_program
-from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, getenv
+from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, DEV
 from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
 from tinygrad.renderer.ptx import PTXRenderer
 from tinygrad.renderer.cstyle import CUDARenderer
 from test.helpers import replace_opts
-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

 from tinygrad.uop.ops import print_uops # noqa: F401 # pylint: disable=unused-import

--- a/test/backend/test_ops.py
+++ b/test/backend/test_ops.py
@ -94,7 +94,7 @@ def prepare_test_op(low, high, shps, vals, forward_only=False):
 class TestOps(unittest.TestCase):

  def helper_test_exception(self, shps, torch_fxn, tinygrad_fxn=None, expected=None, forward_only=False, exact=False, vals=None, low=-1.5, high=1.5):
-    if getenv("MOCKGPU") and Device.DEFAULT == "NV": self.skipTest('helper_test_exception fails in CI CUDA')
+    if DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV": self.skipTest('helper_test_exception fails in CI CUDA')
    ts, tst = prepare_test_op(low, high, shps, vals, forward_only)
    if tinygrad_fxn is None:
      tinygrad_fxn = torch_fxn
@ -877,7 +877,7 @@ class TestOps(unittest.TestCase):
    helper_test_op([(45,65)], lambda x: x.sin())
    helper_test_op([()], lambda x: x.sin())
    # works on real CUDA but not CI
-    if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
+    if not ((DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
      helper_test_op(None, lambda x: x.sin(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
      helper_test_op(None, lambda x: x.sin(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
                    atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@ -886,7 +886,7 @@ class TestOps(unittest.TestCase):
  def test_cos(self):
    helper_test_op([(45,65)], lambda x: x.cos())
    helper_test_op([()], lambda x: x.cos())
-    if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
+    if not ((DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
      helper_test_op(None, lambda x: x.cos(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
      helper_test_op(None, lambda x: x.cos(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
                    atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@ -897,7 +897,7 @@ class TestOps(unittest.TestCase):
    helper_test_op([(45,65)], lambda x: x.tan(), low=-1.5, high=1.5)
    helper_test_op([(45,65)], lambda x: x.tan(), low=-5, high=5)
    helper_test_op([()], lambda x: x.tan())
-    if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
+    if not ((DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
      helper_test_op(None, lambda x: x.tan(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
      helper_test_op(None, lambda x: x.tan(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
                    atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@ -3310,7 +3310,7 @@ class TestOps(unittest.TestCase):
    helper_test_op([(32,10)], lambda x: x.masked_fill((x>0.1).detach(), -math.inf))
    helper_test_op([(32,10)], lambda x: x.masked_fill((x<0.1).detach(), -math.inf))

-  @unittest.skipIf((getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
+  @unittest.skipIf((DEV.interface.startswith("MOCK") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
  @unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
  @unittest.skipIf(Device.DEFAULT == "QCOM", "QCOM fails with: Resource deadlock avoided")
  def test_masked_select(self):
--- a/test/backend/test_profiler.py
+++ b/test/backend/test_profiler.py
@ -1,11 +1,11 @@
 import unittest, struct, contextlib, statistics, gc
 from tinygrad import Device, Tensor, dtypes, TinyJit
-from tinygrad.helpers import CI, getenv, Context, ProfileRangeEvent, cpu_profile, cpu_events, ProfilePointEvent, dedup
+from tinygrad.helpers import CI, DEV, Context, ProfileRangeEvent, cpu_profile, cpu_events, ProfilePointEvent, dedup
 from tinygrad.device import Buffer, BufferSpec, Compiled, ProfileDeviceEvent, ProfileGraphEvent
 from tinygrad.runtime.support.hcq import HCQCompiled
 from tinygrad.engine.realize import get_runner

-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")
 def _dev_base(d):
  p = d.split(":")
  return p[0] if len(p) < 2 or not p[1].isdigit() else f"{p[0]}:{p[1]}"
--- a/test/backend/test_subbuffer.py
+++ b/test/backend/test_subbuffer.py
@ -1,7 +1,7 @@
 import unittest
 from tinygrad import Device, dtypes, Tensor
 from tinygrad.device import Buffer
-from tinygrad.helpers import Context, getenv
+from tinygrad.helpers import Context, DEV
 from test.helpers import needs_second_gpu

@unittest.skipUnless(hasattr(Device[Device.DEFAULT].allocator, "_offset"), "subbuffer not supported")
@ -42,7 +42,7 @@ class TestSubBuffer(unittest.TestCase):
    assert out == [102, 103]

  @needs_second_gpu
-  @unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"} or getenv("MOCKGPU"), "only NV, AMD, CUDA")
+  @unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"} or DEV.interface.startswith("MOCK"), "only NV, AMD, CUDA")
  def test_subbuffer_transfer(self):
    t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()
    vt = t[2:5].contiguous().realize()
--- a/test/backend/test_transcendental.py
+++ b/test/backend/test_transcendental.py
@ -14,7 +14,7 @@ settings.load_profile("my_profile")

 class TestTranscendentalMath(unittest.TestCase):
  @unittest.skipUnless(is_dtype_supported(dtypes.float64), f"no float64 on {Device.DEFAULT}")
-  @unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
+  @unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
  @given(ht.float64, strat.sampled_from([(Tensor.exp, np.exp), (Tensor.log, np.log), (Tensor.sin, np.sin)]))
  def test_float64(self, x, op):
    if op[0] == Tensor.sin:
@ -25,7 +25,7 @@ class TestTranscendentalMath(unittest.TestCase):
                                 op[1](np.array([x], dtype=_to_np_dtype(dtypes.float64))),
                                 atol=3e-2, rtol=1e-5)  # sin can have bigger atol for very big x

-  @unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
+  @unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
  @given(ht.float32, strat.sampled_from([(Tensor.exp, np.exp),(Tensor.log, np.log)] +
    ([(Tensor.sin, np.sin)] if is_dtype_supported(dtypes.ulong) else [])))
  def test_float32(self, x, op):
@ -66,7 +66,7 @@ class TestFromFuzzer(unittest.TestCase):
    if not is_dtype_supported(dtype): return
    if dtype == dtypes.float64:
      # crashes in CI CUDA
-      if getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}: return
+      if DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}: return
    def _test_value(n: float, unit: float=1.0):
      next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
      ulp = next_float - 1.0
@ -88,7 +88,7 @@ class TestFromFuzzer(unittest.TestCase):
    if not is_dtype_supported(dtype): return
    if dtype == dtypes.float64:
      # crashes in CI CUDA
-      if getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}: return
+      if DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}: return
    def _test_value(n: float, unit: float=1.0):
      next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
      ulp = next_float - 1.0
--- a/test/device/test_hcq.py
+++ b/test/device/test_hcq.py
@ -1,6 +1,6 @@
 import unittest, ctypes, struct, os, random, numpy as np, time
 from tinygrad import Device, Tensor, dtypes
-from tinygrad.helpers import getenv, mv_address, DEBUG, DEV
+from tinygrad.helpers import mv_address, DEBUG, DEV
 from test.helpers import slow, replace_opts
 from tinygrad.device import Buffer, BufferSpec
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
@ -10,7 +10,7 @@ from tinygrad.engine.realize import get_runner, CompiledRunner, get_program
 from tinygrad.codegen.opt import Opt, OptOps
 from tinygrad import Variable

-MOCKGPU = getenv("MOCKGPU")
+MOCKGPU = DEV.interface.startswith("MOCK")

@unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "HCQ device required to run")
 class TestHCQ(unittest.TestCase):
@ -76,7 +76,7 @@ class TestHCQ(unittest.TestCase):
        TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
        TestHCQ.d0.timeline_value += 1

-  @unittest.skipIf(Device.DEFAULT in {"CPU"} or (DEV.interface == "PCI" and DEV.device == "AMD"), "Can't handle async update on CPU/MOCKAM device")
+  @unittest.skipIf(Device.DEFAULT == "CPU" or (DEV.interface == "MOCKPCI" and DEV.device == "AMD"), "Can't handle async update on CPU/MOCKPCI device")
  def test_wait_late_set(self):
    for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]:
      if queue_type is None: continue
@ -575,7 +575,7 @@ class TestHCQ(unittest.TestCase):

      np.testing.assert_equal(cpu_buffer.numpy(), local_buf.numpy(), "failed")

-  @unittest.skipUnless(MOCKGPU and not (DEV.device == "AMD" and DEV.interface == "PCI"), "Emulate this on MOCKGPU to check the path in CI")
+  @unittest.skipUnless(MOCKGPU and not (DEV.device == "AMD" and DEV.interface == "MOCKPCI"), "Emulate this on MOCKGPU to check the path in CI")
  def test_on_device_hang(self):
    if not hasattr(self.d0, 'on_device_hang'): self.skipTest("device does not have on_device_hang")

--- a/test/external/external_test_am_fault_recovery.py
+++ b/test/external/external_test_am_fault_recovery.py
@ -1,5 +1,6 @@
 # ruff: noqa: F405
 import unittest, subprocess, os
+from tinygrad.helpers import DEV
 from tinygrad.runtime.autogen.amd.rdna3.ins import *  # noqa: F403
 from tinygrad.renderer.amd.dsl import s, v, Inst, NULL

@ -27,7 +28,7 @@ _ILLEGAL_INST_ASM = ".text\n.globl test\n.p2align 8\n.type test,@function\ntest:
  ".rodata\n.p2align 6\n.amdhsa_kernel test\n.amdhsa_next_free_vgpr 8\n.amdhsa_next_free_sgpr 8\n" \
  ".amdhsa_wavefront_size32 1\n.amdhsa_user_sgpr_kernarg_segment_ptr 1\n.amdhsa_kernarg_size 8\n.end_amdhsa_kernel"

-@unittest.skipIf(os.environ.get("AMD") != "1" or os.environ.get("MOCKGPU") == "1", "AMD with AM driver required")
+@unittest.skipIf(DEV.device != "AMD" or DEV.interface.startswith("MOCK"), "AMD with AM driver required")  # real hardware only, never a mock
 class TestAMFaultRecovery(unittest.TestCase):
  def _run_kernel(self, insts: list[Inst]) -> subprocess.CompletedProcess: return _run_asm(assemble_kernel(insts))

--- a/test/mockgpu/amd/README
+++ b/test/mockgpu/amd/README
@ -4,7 +4,7 @@ Test with `pytest -n12 test/amd/`
 `DEV=AMD:LLVM pytest -n12 test/amd/`

 * dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
-* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1`
+* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=MOCK{KFD|PCI|USB}+AMD`
 * generate.py -- extract assembly format + instruction pseudocode from AMD XML + PDF
 * test/mockgpu/amd/pcode.py -- pseudocode to UOp transformation
 * sqtt.py -- SQTT parser
@ -20,18 +20,18 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same

 tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.

-`DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
-`DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
+`DEV=MOCKKFD+AMD pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
+`DEV=MOCKKFD+AMD:LLVM pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`

 The ops tests also pass, but they are very slow, so you should run them one at a time.

-`SKIP_SLOW_TEST=1 DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
-`SKIP_SLOW_TEST=1 DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
+`SKIP_SLOW_TEST=1 DEV=MOCKKFD+AMD pytest -n=12 test/backend/test_ops.py`
+`SKIP_SLOW_TEST=1 DEV=MOCKKFD+AMD:LLVM pytest -n=12 test/backend/test_ops.py`

 When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
 While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware
-If a test is failing with `DEV=AMD MOCKGPU=1` it's because an instruction is emulated incorrectly.
-You can test without `MOCKGPU=1` to test on real hardware, if it works on real hardware there's a bug in the emulator.
+If a test is failing with `DEV=MOCKKFD+AMD` it's because an instruction is emulated incorrectly.
+You can test with just `DEV=AMD` to test on real hardware, if it works on real hardware there's a bug in the emulator.
 IMPORTANT: if a test is failing in the emulator, it's an instruction bug. Use DEBUG=7, get the instructions, and debug.

 Currently, only RDNA3 is well supported, but when finished, this will support RDNA3+RDNA4+CDNA in ~3000 lines.
--- a/test/mockgpu/mockgpu.py
+++ b/test/mockgpu/mockgpu.py
@ -11,8 +11,8 @@ libc = ctypes.CDLL(ctypes.util.find_library("c"))
 libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
 libc.mmap.restype = ctypes.c_void_p

-_amd_iface = DEV.target("AMD").interface
-drivers = [NVDriver(), AMDriver() if _amd_iface == "PCI" else (AMUSBDriver() if _amd_iface == "USB" else AMDDriver())]
+drivers = [cls() for t in DEV.value if (cls:={"MOCKPCI+AMD": AMDriver, "MOCKKFD+AMD": AMDDriver, "MOCKUSB+AMD": AMUSBDriver,
+                                              "MOCKNVK+NV": NVDriver}.get(f"{t.interface}+{t.device}"))]
 tracked_fds = {}

 original_memoryview = builtins.memoryview
--- a/test/speed/external_test_speed_v_torch.py
+++ b/test/speed/external_test_speed_v_torch.py
@ -12,7 +12,7 @@ import numpy as np
 np.set_printoptions(linewidth=160)
 from tinygrad import Tensor, Device, GlobalCounters, TinyJit
 from tinygrad.nn import Conv2d
-from tinygrad.helpers import colorize_float, getenv, CI
+from tinygrad.helpers import colorize_float, getenv, CI, DEV

 IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]

@ -113,7 +113,7 @@ def helper_test_conv(bs, in_chans, out_chans, kernel_size, img_size_y, img_size_
  helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:{kernel_size}", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))

@unittest.skipIf(getenv("BIG") == 0, "no big tests")
-@unittest.skipIf(getenv("MOCKGPU"), "no MOCKGPUs")
+@unittest.skipIf(DEV.interface.startswith("MOCK"), "no MOCKGPUs")
 class TestBigSpeed(unittest.TestCase):
  def test_add(self):
    def f(a, b): return a+b
@ -134,7 +134,7 @@ class TestBigSpeed(unittest.TestCase):
  def test_matvec_16384_4096(self): helper_test_matvec('matvec_16384_4096', 16384, 4096)

@unittest.skipIf(getenv("BIG") == 1, "only big tests")
-@unittest.skipIf(getenv("MOCKGPU"), "no MOCKGPUs")
+@unittest.skipIf(DEV.interface.startswith("MOCK"), "no MOCKGPUs")
 class TestSpeed(unittest.TestCase):
  def test_sub(self):
    def f(a, b): return a-b
--- a/test/testextra/test_mockgpu.py
+++ b/test/testextra/test_mockgpu.py
@ -1,7 +1,7 @@
-from tinygrad.helpers import getenv
+from tinygrad.helpers import DEV
 import unittest, importlib

-@unittest.skipUnless(getenv("MOCKGPU"), 'Testing mockgpu')
+@unittest.skipUnless(DEV.interface.startswith("MOCK"), 'Testing mockgpu')
 class TestMockGPU(unittest.TestCase):
  # https://github.com/tinygrad/tinygrad/pull/7627
  def test_import_typing_extensions(self):
--- a/test/unit/test_hashing.py
+++ b/test/unit/test_hashing.py
@ -1,13 +1,14 @@
 from typing_extensions import Callable
 import hashlib, random, unittest
-from tinygrad import Tensor, Device, getenv, dtypes
+from tinygrad import Tensor, Device, dtypes
+from tinygrad.helpers import DEV
 from test.helpers import slow
 from tinygrad.device import is_dtype_supported
 from tinygrad.uop.ops import UOp
 from tinygrad.engine.jit import TinyJit

@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
-@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
+@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV", "crashes in NV CI")
 class TestHashing(unittest.TestCase):
  def _python_hash_1mb(self, data:bytes):
    chunks = [data[i:i+4096] for i in range(0, len(data), 4096)]
@ -21,7 +22,7 @@ class TestHashing(unittest.TestCase):
    self.assertEqual(bytes(out.data()), expected)

@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
-@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
+@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV", "crashes in NV CI")
 class TestKeccak(unittest.TestCase):
  def setUp(self) -> None: random.seed(1337)

--- a/tinygrad/renderer/cstyle.py
+++ b/tinygrad/renderer/cstyle.py
@ -394,9 +394,8 @@ class CUDARenderer(CStyleLanguage):
  def __init__(self, target:Target, use_nvcc=False):
    super().__init__(target)
    from tinygrad.runtime.support.compiler_cuda import NVRTCCompiler, NVCCCompiler
-    from tinygrad.runtime.support.hcq import MOCKGPU
-    dev, arch = target.device, target.arch
-    self.compiler = (NVCCCompiler if use_nvcc else NVRTCCompiler)(arch, ptx=bool(MOCKGPU) or dev == "CUDA", cache_key=dev.lower())
+    iface, dev, arch = target.interface, target.device, target.arch
+    self.compiler = (NVCCCompiler if use_nvcc else NVRTCCompiler)(arch, ptx=iface.startswith("MOCK") or dev == "CUDA", cache_key=dev.lower())
    self.tensor_cores = tc.get_cuda(arch)

  # language options
--- a/tinygrad/renderer/ptx.py
+++ b/tinygrad/renderer/ptx.py
@ -145,8 +145,7 @@ class PTXRenderer(Renderer):
  def __init__(self, target:Target):
    super().__init__(target)
    from tinygrad.runtime.support.compiler_cuda import NVPTXCompiler, PTXCompiler
-    from tinygrad.runtime.support.hcq import MOCKGPU
-    self.compiler = (PTXCompiler if bool(MOCKGPU) or target.device == "CUDA" else NVPTXCompiler)(target.arch)
+    self.compiler = (PTXCompiler if target.interface.startswith("MOCK") or target.device == "CUDA" else NVPTXCompiler)(target.arch)
    self.tensor_cores = PTXRenderer.tc_sm80 if (ver:=int(target.arch[3:])) >= 80 else tc.cuda_sm75 if ver >= 75 else []

  # language options
--- a/tinygrad/runtime/graph/hcq.py
+++ b/tinygrad/runtime/graph/hcq.py
@ -325,5 +325,5 @@ class HCQGraph(MultiGraphRunner):
    if new_call.src[0].op is Ops.COPY:
      # MOCKGPU is not supported, since it can't execute commands in parallel
      is_xfer = len(set(type(d) for d in all_devs)) == 1 and hasattr(alc:=all_devs[0].allocator, '_transfer') and alc.supports_transfer
-      return is_xfer or (all_devs[0].hw_copy_queue_t is not None and not getenv("MOCKGPU"))
+      return is_xfer or (all_devs[0].hw_copy_queue_t is not None and not getattr(all_devs[0], 'iface', None).__class__.__name__.startswith("MOCK"))
    return _unwrap_beam(new_call.src[0]).op in (Ops.SINK, Ops.PROGRAM)
--- a/tinygrad/runtime/ops_amd.py
+++ b/tinygrad/runtime/ops_amd.py
@ -941,13 +941,15 @@ class USBIface(PCIIface):

  def sleep(self, timeout): pass

+def mock_iface(iface): return type(f"MOCK{iface.__name__}", (iface,), {})
+
 class AMDDevice(HCQCompiled):
  def is_am(self) -> bool: return isinstance(self.iface, (PCIIface, USBIface))
  def is_usb(self) -> bool: return isinstance(self.iface, USBIface)

  def __init__(self, device:str=""):
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
-    self.iface = self._select_iface(KFDIface, PCIIface, USBIface)
+    self.iface = self._select_iface(KFDIface, PCIIface, USBIface, mock_iface(KFDIface), mock_iface(PCIIface), mock_iface(USBIface))
    self.target:tuple[int, ...] = ((trgt:=self.iface.props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
    self.arch = "gfx%d%x%x" % self.target
    if self.target < (9,4,2) or self.target >= (13,0,0): raise RuntimeError(f"Unsupported arch: {self.arch}")
--- a/tinygrad/runtime/ops_cuda.py
+++ b/tinygrad/runtime/ops_cuda.py
@ -1,6 +1,6 @@
 from __future__ import annotations
 import ctypes, functools
-from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing
+from tinygrad.helpers import DEBUG, DEV, getenv, mv_address, suppress_finalizing
 from tinygrad.device import Compiled, BufferSpec, LRUAllocator
 from tinygrad.renderer.cstyle import CUDARenderer, NVCCRenderer
 from tinygrad.renderer.ptx import PTXRenderer
@ -8,7 +8,7 @@ from tinygrad.runtime.autogen import cuda
 from tinygrad.runtime.support.compiler_cuda import pretty_ptx
 from tinygrad.runtime.support.c import init_c_struct_t, init_c_var
 if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl  # noqa: F401  # pylint: disable=unused-import
-if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported
+if (MOCKGPU:=DEV.target("CUDA").interface == "MOCK"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported

 def check(status):
  if status != 0:
--- a/tinygrad/runtime/ops_nv.py
+++ b/tinygrad/runtime/ops_nv.py
@ -4,7 +4,7 @@ assert sys.platform != 'win32'
 from typing import cast
 from dataclasses import dataclass
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
-from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
+from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, hcq_filter_visible_devices, hcq_profile
 from tinygrad.uop.ops import sint
 from tinygrad.device import Compiled, BufferSpec
 from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, hi32, lo32, PROFILE, ContextVar, VIZ, ProfileEvent
@ -240,7 +240,7 @@ class NVVideoQueue(NVCommandQueue):

 class NVArgsState(CLikeArgsState):
  def __init__(self, buf:HCQBuffer, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
-    if MOCKGPU: prg.cbuf_0[80:82] = [len(bufs), len(vals)]
+    if isinstance(prg.dev.iface, MOCKNVKIface): prg.cbuf_0[80:82] = [len(bufs), len(vals)]
    super().__init__(buf, prg, bufs, vals=vals, prefix=prg.cbuf_0 or None)

 class NVProgram(HCQProgram):
@ -251,14 +251,14 @@ class NVProgram(HCQProgram):
    if (NAK:=isinstance(dev.renderer, NAKRenderer)):
      image, self.cbuf_0 = memoryview(bytearray(lib[ctypes.sizeof(info:=mesa.struct_nak_shader_info.from_buffer_copy(lib)):])), []
      self.regs_usage, self.shmem_usage, self.lcmem_usage = info.num_gprs, round_up(info.cs.smem_size, 128), round_up(info.slm_size, 16)
-    elif MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
+    elif isinstance(dev.iface, MOCKNVKIface): image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
    else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)
    # NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
    self.lib_gpu = self.dev.allocator.alloc(round_up((prog_sz:=image.nbytes), 0x1000) + 0x1000, buf_spec:=BufferSpec(nolru=True))
    prog_addr = self.lib_gpu.va_addr
    if not NAK:
      # For MOCKGPU, the lib is PTX code, so some values are emulated.
-      self.regs_usage, self.shmem_usage, self.lcmem_usage, cbuf0_size = 0, 0x400, 0x240, 0 if not MOCKGPU else 0x160
+      self.regs_usage, self.shmem_usage, self.lcmem_usage, cbuf0_size = 0, 0x400, 0x240, 0x160 if isinstance(dev.iface, MOCKNVKIface) else 0
      for sh in sections: # pylint: disable=possibly-used-before-assignment
        if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
        if sh.name == f".text.{self.name}": prog_addr, prog_sz = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size
@ -472,7 +472,8 @@ class NVKIface:

  def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, cpu_addr=None, **kwargs) -> HCQBuffer:
    # Uncached memory is "system". Use huge pages only for gpu memory.
-    page_size = mmap.PAGESIZE if uncached or host else ((2 << 20) if size >= (8 << 20) else (mmap.PAGESIZE if MOCKGPU else 4 << 10))
+    page_size = mmap.PAGESIZE if uncached or host else ((2 << 20) if size >= (8 << 20) else (mmap.PAGESIZE if isinstance(self, MOCKNVKIface) else
+                                                                                             4 << 10))
    size = round_up(size, page_size)
    va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access) if (alloced:=cpu_addr is None) else cpu_addr

@ -575,12 +576,14 @@ class PCIIface(PCIIfaceBase):
    for _ in self.dev_impl.gsp.stat_q.read_resp(): pass
    if self.dev_impl.is_err_state: raise RuntimeError("Device fault detected")

+class MOCKNVKIface(NVKIface): pass
+
 class NVDevice(HCQCompiled[NVSignal]):
  def is_nvd(self) -> bool: return isinstance(self.iface, PCIIface)

  def __init__(self, device:str=""):
    self.device_id = int(device.split(":")[1]) if ":" in device else 0
-    self.iface = self._select_iface(NVKIface, PCIIface)
+    self.iface = self._select_iface(NVKIface, PCIIface, MOCKNVKIface)

    device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.iface.gpu_instance, hClientShare=self.iface.root,
                                                   vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_OPTIONAL_MULTIPLE_VASPACES)
--- a/tinygrad/runtime/support/hcq.py
+++ b/tinygrad/runtime/support/hcq.py
@ -56,7 +56,7 @@ class FileIOInterface:
  @staticmethod
  def eventfd(initval, flags=None): return FileIOInterface(fd=os.eventfd(initval, flags))  # type: ignore[attr-defined]

-if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockFileIOInterface as FileIOInterface  # noqa: F401 # pylint: disable=unused-import
+if DEV.interface.startswith("MOCK"): from test.mockgpu.mockgpu import MockFileIOInterface as FileIOInterface  # noqa: F401 # pylint: disable=unused-import

 # **************** for HCQ Compatible Devices ****************

@ -491,6 +491,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
      f"{k}={v} is deprecated, use DEV={replace(DEV.target(type(self).__name__[:-6]), interface=v)} instead"
    t = DEV.target(dev:=type(self).__name__[:-6])
    filtered = select_by_name(ifaces, lambda i: i.__name__[:-5], t.interface, f"{dev} has no interface {t.interface!r}")
+    filtered = [i for i in filtered if t.interface.startswith("MOCK") or not i.__name__[:-5].startswith("MOCK")] # never fall back to mock ifaces
    return select_first_inited([functools.partial(cast(Callable, iface), self, self.device_id) for iface in filtered],
                               f"No interface for {dev}:{self.device_id} is available")

--- a/tinygrad/runtime/support/usb.py
+++ b/tinygrad/runtime/support/usb.py
@ -1,7 +1,7 @@
 import ctypes, struct, dataclasses, array, itertools, time
 from typing import Sequence
 from tinygrad.runtime.autogen import libusb
-from tinygrad.helpers import DEBUG, to_mv, round_up, OSX, getenv, ceildiv
+from tinygrad.helpers import DEBUG, DEV, to_mv, round_up, OSX, getenv, ceildiv
 from tinygrad.runtime.support.hcq import MMIOInterface

 def alloc_cbuffer(sz:int) -> tuple[ctypes.Array, memoryview]: return (buf:=(ctypes.c_ubyte * sz)()), to_mv(ctypes.addressof(buf), sz)
@ -449,4 +449,4 @@ class USBMMIOInterface(MMIOInterface):
    _, acc_sz = self._acc_size(len(data) * struct.calcsize(self.fmt))
    self.usb.pcie_mem_write(self.addr+off, [int.from_bytes(data[i:i+acc_sz], "little") for i in range(0, len(data), acc_sz)], acc_sz)

-if getenv("MOCKGPU"): from test.mockgpu.usb import MockUSB3 as USB3  # type: ignore  # noqa: F811
+if DEV.interface.startswith("MOCK"): from test.mockgpu.usb import MockUSB3 as USB3  # type: ignore  # noqa: F811