MOCKGPU interfaces (#15796)

This commit is contained in:
Christopher Milan 2026-04-17 18:56:29 -07:00 committed by GitHub
commit 6adf4c3cd9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
29 changed files with 94 additions and 95 deletions

View file

@ -643,8 +643,7 @@ jobs:
runs-on: ubuntu-24.04
timeout-minutes: 20
env:
DEV: AMD
MOCKGPU: 1
DEV: MOCKKFD+AMD
steps:
- name: Checkout Code
uses: actions/checkout@v6
@ -670,7 +669,7 @@ jobs:
- name: Run AMD renderer tests
run: python -m pytest -n=auto test/amd/ --durations 20
- name: Run AMD renderer tests (AMD:LLVM)
run: DEV=AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
run: DEV=MOCKKFD+AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
- name: Run SQTT profiling tests
run: PROFILE=1 SQTT=1 python3 -m pytest -n=auto test/amd/test_sqtt_profiler.py
- name: Run AMD emulated tests on NULL backend
@ -679,20 +678,19 @@ jobs:
run: |
PYTHONPATH=. DEV=NULL::gfx1100 python extra/mmapeak/mmapeak.py
PYTHONPATH=. DEV=NULL::gfx1201 python3 -m pytest -n=auto test/testextra/test_tk.py test/backend/test_asm_gemm.py
- name: Run matmul on MOCKGPU
- name: Run matmul on MOCKKFD
run: |
PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_asm_matmul.py
PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_copy_matmul.py
PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_asm_matmul.py
PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_copy_matmul.py
- name: Run LLVM test
run: DEV=AMD:LLVM python test/device/test_amd_llvm.py
run: DEV=MOCKKFD+AMD:LLVM python test/device/test_amd_llvm.py
testmockam:
name: Linux (am)
runs-on: ubuntu-24.04
timeout-minutes: 15
env:
DEV: PCI+AMD
MOCKGPU: 1
DEV: MOCKPCI+AMD
steps:
- name: Checkout Code
uses: actions/checkout@v6
@ -704,13 +702,13 @@ jobs:
amd: 'true'
- name: Run test_tiny on MOCKAM
run: python test/test_tiny.py
- name: Run test_tiny on MOCKAM USB
run: GMMU=0 DEV=USB+AMD python test/test_tiny.py
- name: Run test_hcq on MOCKAM
- name: Run test_tiny on MOCKUSB
run: GMMU=0 DEV=MOCKUSB+AMD python test/test_tiny.py
- name: Run test_hcq on MOCKPCI
run: python -m pytest test/device/test_hcq.py
- name: Run disk copy tests on MOCKAM
- name: Run disk copy tests on MOCKPCI
run: python -m pytest test/unit/test_disk_tensor.py -k test_copy_from_disk
- name: Run test_tiny on MOCKAM Remote
- name: Run test_tiny on MOCKPCI Remote
run: |
python extra/remote/serve.py 6667 &
sleep 2
@ -728,8 +726,7 @@ jobs:
runs-on: ubuntu-22.04
timeout-minutes: 15
env:
DEV: AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
MOCKGPU: 1
DEV: MOCKKFD+AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
SKIP_SLOW_TEST: 1
steps:
- name: Checkout Code
@ -764,7 +761,6 @@ jobs:
runs-on: ubuntu-22.04
timeout-minutes: 20
env:
MOCKGPU: 1
FORWARD_ONLY: 1
steps:
- name: Checkout Code
@ -777,7 +773,7 @@ jobs:
cuda: 'true'
ocelot: 'true'
- name: Set env
run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA:PTX' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
run: printf "${{ matrix.backend == 'ptx' && 'DEV=MOCK+CUDA:PTX' || matrix.backend == 'nv' && 'DEV=MOCKNVK+NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
- name: Check Device.DEFAULT and print some source
run: |
python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
@ -862,22 +858,19 @@ jobs:
run: DEV=METAL TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
- name: Run pytest (amd)
env:
MOCKGPU: 1
DEV: AMD
DEV: MOCKKFD+AMD
FORWARD_ONLY: 1
run: |
python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
- name: Run pytest (amd with llvm backend)
env:
MOCKGPU: 1
DEV: "AMD:LLVM"
DEV: "MOCKKFD+AMD:LLVM"
FORWARD_ONLY: 1
run: |
python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
- name: Run pytest (ptx)
env:
MOCKGPU: 1
DEV: "NV:PTX"
DEV: "MOCKNVK+NV:PTX"
FORWARD_ONLY: 1
# TODO: failing due to library loading error
CAPTURE_PROCESS_REPLAY: 0

View file

@ -1,9 +1,9 @@
# tinygrad allows you to write kernels at many different abstractions levels.
# This is for RDNA3, but if you don't have one you can run with the emulator
# PYTHONPATH="." MOCKGPU=1 DEV=AMD
# PYTHONPATH="." DEV=MOCKPCI+AMD
from tinygrad import Tensor, Context, GlobalCounters, UOp, Device
from tinygrad.helpers import DEBUG, getenv
from tinygrad.helpers import DEV, DEBUG, getenv
from tinygrad.uop.ops import AxisType, KernelInfo, Ops
from tinygrad.dtype import AddrSpace, dtypes
from tinygrad.runtime.autogen.amd.rdna3.ins import *
@ -16,7 +16,7 @@ def eval_harness(name, tensor, fxn, check=None):
print(f"computed in {GlobalCounters.time_sum_s*1000:.2f} ms, {(a.nbytes()/1e9)/GlobalCounters.time_sum_s:.2f} GB/s")
return out
SZ = 256*1024 if getenv("MOCKGPU") else 1024*1024*1024
SZ = 256*1024 if DEV.interface.startswith("MOCK") else 1024*1024*1024
def example_2_hip(a:Tensor, correct):
GLOBALS = 1024

View file

@ -37,8 +37,7 @@ dev.synchronize()
'''
env = os.environ.copy()
env["AMD"] = "1"
env["MOCKGPU"] = "1"
env["DEV"] = "MOCKKFD+AMD"
env["HCQDEV_WAIT_TIMEOUT_MS"] = "10000"
st = time.perf_counter()

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""Tests for SQTT encoder: verifies the emulator produces correct SQTT traces for known kernels.
Run with: DEV=AMD MOCKGPU=1 python -m pytest test/amd/test_sqtt_encoder.py -v
Run with: DEV=MOCKKFD+AMD python -m pytest test/amd/test_sqtt_encoder.py -v
"""
import ctypes, unittest
from tinygrad.helpers import Context

View file

@ -1,7 +1,7 @@
import unittest
from tinygrad import Tensor, Device, dtypes, Context
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import getenv, system
from tinygrad.helpers import getenv, system, DEV
from extra.gemm.cdna_asm_gemm import asm_gemm
from test.helpers import needs_second_gpu
from examples.mlperf.models.flat_llama import FP8_DTYPE
@ -131,7 +131,7 @@ class TestGemmLlama(unittest.TestCase):
dtype = dtypes.bfloat16
def setUp(self):
if not is_cdna4() or getenv("MOCKGPU"):
if not is_cdna4() or DEV.interface.startswith("MOCK"):
self.skipTest("very slow on non mi350x")
def test_empty(self): asm_gemm(Tensor.empty(N:=getenv("N", 4096), N, dtype=self.dtype), Tensor.empty(N, N, dtype=self.dtype)).realize()

View file

@ -1,7 +1,7 @@
import unittest, operator, math
from tinygrad import Context, Tensor, dtypes, Device
from tinygrad.dtype import DType, truncate, fp8_to_float
from tinygrad.helpers import CI, EMULATED_DTYPES, getenv
from tinygrad.helpers import CI, EMULATED_DTYPES, DEV, getenv
from tinygrad.tensor import _to_np_dtype
from tinygrad.device import is_dtype_supported
from tinygrad.runtime.ops_python import from_storage_scalar
@ -32,7 +32,8 @@ unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), (Tensor.sin, np.
#binary_operations.append(operator.truediv)
# TODO: CI CUDA segfaults on sin, WEBGPU and NIR sines are not precise enough for large numbers
if (getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}) or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer):
if ((DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"})
or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer)):
unary_operations.remove((Tensor.sin, np.sin))
unary_operations.remove((Tensor.cos, np.cos))

View file

@ -27,10 +27,10 @@ import numpy as np
import torch
from tinygrad import Tensor, dtypes, nn
from tinygrad.device import Device
from tinygrad.helpers import getenv
from tinygrad.helpers import DEV
from tinygrad.renderer.nir import NIRRenderer
MOCKGPU = getenv("MOCKGPU")
MOCKGPU = DEV.interface.startswith("MOCK")
class TestNaNEdgeCases(unittest.TestCase):
# we don't need more of these. it's unclear if torch's behavior is desired here

View file

@ -3,12 +3,12 @@ import unittest
import torch
import numpy as np
from tinygrad.helpers import getenv, CI
from tinygrad.helpers import CI, DEV
from tinygrad.tensor import Tensor
from tinygrad.device import Device
from tinygrad.dtype import _from_torch_dtype, _to_torch_dtype
MOCKGPU = getenv("MOCKGPU")
MOCKGPU = DEV.interface.startswith("MOCK")
@unittest.skipIf(Device.DEFAULT not in ["METAL", "CUDA"] or MOCKGPU, f"no support on {Device.DEFAULT}")
class TestInterop(unittest.TestCase):

View file

@ -8,7 +8,7 @@ from tinygrad.tensor import Tensor
from tinygrad.engine.jit import TinyJit, JitError, GraphRunner, MultiGraphRunner, graph_class
from tinygrad.engine.realize import CompiledRunner, BufferCopy, BufferXfer
from tinygrad.device import Device
from tinygrad.helpers import Context, JIT, GlobalCounters, getenv
from tinygrad.helpers import Context, JIT, DEV, GlobalCounters
from tinygrad.dtype import dtypes
from extra.models.unet import ResBlock
@ -812,7 +812,7 @@ class TestJitGraphSplit(unittest.TestCase):
hcqgraph=[self.ji_graph(6)])
@unittest.skip("this fails if you don't have SDMA or are using AMD_DISABLE_SDMA=1")
@unittest.skipIf(getenv("MOCKGPU"), "MockGPU does not support parallel copies")
@unittest.skipIf(DEV.interface.startswith("MOCK"), "MockGPU does not support parallel copies")
def test_jit_multidev_copy(self):
if Device.DEFAULT in {"CPU"}: raise unittest.SkipTest("CPU/LLVM is not a valid default device for this test (zero-copies)")

View file

@ -7,12 +7,12 @@ from tinygrad.uop.ops import UOp, Ops, GroupOp, AxisType
from tinygrad.device import Device, Buffer, is_dtype_supported
from tinygrad.tensor import Tensor, _to_np_dtype
from tinygrad.engine.realize import run_schedule, CompiledRunner, get_program
from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, getenv
from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, DEV
from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.renderer.cstyle import CUDARenderer
from test.helpers import replace_opts
MOCKGPU = getenv("MOCKGPU")
MOCKGPU = DEV.interface.startswith("MOCK")
from tinygrad.uop.ops import print_uops # noqa: F401 # pylint: disable=unused-import

View file

@ -94,7 +94,7 @@ def prepare_test_op(low, high, shps, vals, forward_only=False):
class TestOps(unittest.TestCase):
def helper_test_exception(self, shps, torch_fxn, tinygrad_fxn=None, expected=None, forward_only=False, exact=False, vals=None, low=-1.5, high=1.5):
if getenv("MOCKGPU") and Device.DEFAULT == "NV": self.skipTest('helper_test_exception fails in CI CUDA')
if DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV": self.skipTest('helper_test_exception fails in CI CUDA')
ts, tst = prepare_test_op(low, high, shps, vals, forward_only)
if tinygrad_fxn is None:
tinygrad_fxn = torch_fxn
@ -877,7 +877,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(45,65)], lambda x: x.sin())
helper_test_op([()], lambda x: x.sin())
# works on real CUDA but not CI
if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
if not ((DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
helper_test_op(None, lambda x: x.sin(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
helper_test_op(None, lambda x: x.sin(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@ -886,7 +886,7 @@ class TestOps(unittest.TestCase):
def test_cos(self):
helper_test_op([(45,65)], lambda x: x.cos())
helper_test_op([()], lambda x: x.cos())
if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
if not ((DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
helper_test_op(None, lambda x: x.cos(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
helper_test_op(None, lambda x: x.cos(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@ -897,7 +897,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(45,65)], lambda x: x.tan(), low=-1.5, high=1.5)
helper_test_op([(45,65)], lambda x: x.tan(), low=-5, high=5)
helper_test_op([()], lambda x: x.tan())
if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
if not ((DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
helper_test_op(None, lambda x: x.tan(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
helper_test_op(None, lambda x: x.tan(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
@ -3310,7 +3310,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(32,10)], lambda x: x.masked_fill((x>0.1).detach(), -math.inf))
helper_test_op([(32,10)], lambda x: x.masked_fill((x<0.1).detach(), -math.inf))
@unittest.skipIf((getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
@unittest.skipIf((DEV.interface.startswith("MOCK") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
@unittest.skipIf(Device.DEFAULT == "QCOM", "QCOM fails with: Resource deadlock avoided")
def test_masked_select(self):

View file

@ -1,11 +1,11 @@
import unittest, struct, contextlib, statistics, gc
from tinygrad import Device, Tensor, dtypes, TinyJit
from tinygrad.helpers import CI, getenv, Context, ProfileRangeEvent, cpu_profile, cpu_events, ProfilePointEvent, dedup
from tinygrad.helpers import CI, DEV, Context, ProfileRangeEvent, cpu_profile, cpu_events, ProfilePointEvent, dedup
from tinygrad.device import Buffer, BufferSpec, Compiled, ProfileDeviceEvent, ProfileGraphEvent
from tinygrad.runtime.support.hcq import HCQCompiled
from tinygrad.engine.realize import get_runner
MOCKGPU = getenv("MOCKGPU")
MOCKGPU = DEV.interface.startswith("MOCK")
def _dev_base(d):
p = d.split(":")
return p[0] if len(p) < 2 or not p[1].isdigit() else f"{p[0]}:{p[1]}"

View file

@ -1,7 +1,7 @@
import unittest
from tinygrad import Device, dtypes, Tensor
from tinygrad.device import Buffer
from tinygrad.helpers import Context, getenv
from tinygrad.helpers import Context, DEV
from test.helpers import needs_second_gpu
@unittest.skipUnless(hasattr(Device[Device.DEFAULT].allocator, "_offset"), "subbuffer not supported")
@ -42,7 +42,7 @@ class TestSubBuffer(unittest.TestCase):
assert out == [102, 103]
@needs_second_gpu
@unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"} or getenv("MOCKGPU"), "only NV, AMD, CUDA")
@unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"} or DEV.interface.startswith("MOCK"), "only NV, AMD, CUDA")
def test_subbuffer_transfer(self):
t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()
vt = t[2:5].contiguous().realize()

View file

@ -14,7 +14,7 @@ settings.load_profile("my_profile")
class TestTranscendentalMath(unittest.TestCase):
@unittest.skipUnless(is_dtype_supported(dtypes.float64), f"no float64 on {Device.DEFAULT}")
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
@given(ht.float64, strat.sampled_from([(Tensor.exp, np.exp), (Tensor.log, np.log), (Tensor.sin, np.sin)]))
def test_float64(self, x, op):
if op[0] == Tensor.sin:
@ -25,7 +25,7 @@ class TestTranscendentalMath(unittest.TestCase):
op[1](np.array([x], dtype=_to_np_dtype(dtypes.float64))),
atol=3e-2, rtol=1e-5) # sin can have bigger atol for very big x
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
@given(ht.float32, strat.sampled_from([(Tensor.exp, np.exp),(Tensor.log, np.log)] +
([(Tensor.sin, np.sin)] if is_dtype_supported(dtypes.ulong) else [])))
def test_float32(self, x, op):
@ -66,7 +66,7 @@ class TestFromFuzzer(unittest.TestCase):
if not is_dtype_supported(dtype): return
if dtype == dtypes.float64:
# crashes in CI CUDA
if getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}: return
if DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}: return
def _test_value(n: float, unit: float=1.0):
next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
ulp = next_float - 1.0
@ -88,7 +88,7 @@ class TestFromFuzzer(unittest.TestCase):
if not is_dtype_supported(dtype): return
if dtype == dtypes.float64:
# crashes in CI CUDA
if getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}: return
if DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}: return
def _test_value(n: float, unit: float=1.0):
next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
ulp = next_float - 1.0

View file

@ -1,6 +1,6 @@
import unittest, ctypes, struct, os, random, numpy as np, time
from tinygrad import Device, Tensor, dtypes
from tinygrad.helpers import getenv, mv_address, DEBUG, DEV
from tinygrad.helpers import mv_address, DEBUG, DEV
from test.helpers import slow, replace_opts
from tinygrad.device import Buffer, BufferSpec
from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
@ -10,7 +10,7 @@ from tinygrad.engine.realize import get_runner, CompiledRunner, get_program
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad import Variable
MOCKGPU = getenv("MOCKGPU")
MOCKGPU = DEV.interface.startswith("MOCK")
@unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "HCQ device required to run")
class TestHCQ(unittest.TestCase):
@ -76,7 +76,7 @@ class TestHCQ(unittest.TestCase):
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
TestHCQ.d0.timeline_value += 1
@unittest.skipIf(Device.DEFAULT in {"CPU"} or (DEV.interface == "PCI" and DEV.device == "AMD"), "Can't handle async update on CPU/MOCKAM device")
@unittest.skipIf(Device.DEFAULT == "CPU" or (DEV.interface == "MOCKPCI" and DEV.device == "AMD"), "Can't handle async update on CPU/MOCKPCI device")
def test_wait_late_set(self):
for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]:
if queue_type is None: continue
@ -575,7 +575,7 @@ class TestHCQ(unittest.TestCase):
np.testing.assert_equal(cpu_buffer.numpy(), local_buf.numpy(), "failed")
@unittest.skipUnless(MOCKGPU and not (DEV.device == "AMD" and DEV.interface == "PCI"), "Emulate this on MOCKGPU to check the path in CI")
@unittest.skipUnless(MOCKGPU and not (DEV.device == "AMD" and DEV.interface == "MOCKPCI"), "Emulate this on MOCKGPU to check the path in CI")
def test_on_device_hang(self):
if not hasattr(self.d0, 'on_device_hang'): self.skipTest("device does not have on_device_hang")

View file

@ -1,5 +1,6 @@
# ruff: noqa: F405
import unittest, subprocess, os
from tinygrad.helpers import DEV
from tinygrad.runtime.autogen.amd.rdna3.ins import * # noqa: F403
from tinygrad.renderer.amd.dsl import s, v, Inst, NULL
@ -27,7 +28,7 @@ _ILLEGAL_INST_ASM = ".text\n.globl test\n.p2align 8\n.type test,@function\ntest:
".rodata\n.p2align 6\n.amdhsa_kernel test\n.amdhsa_next_free_vgpr 8\n.amdhsa_next_free_sgpr 8\n" \
".amdhsa_wavefront_size32 1\n.amdhsa_user_sgpr_kernarg_segment_ptr 1\n.amdhsa_kernarg_size 8\n.end_amdhsa_kernel"
@unittest.skipIf(os.environ.get("AMD") != "1" or os.environ.get("MOCKGPU") == "1", "AMD with AM driver required")
@unittest.skipIf(DEV.device != "AMD" or not DEV.interface.startswith("MOCK"), "AMD with AM driver required")
class TestAMFaultRecovery(unittest.TestCase):
def _run_kernel(self, insts: list[Inst]) -> subprocess.CompletedProcess: return _run_asm(assemble_kernel(insts))

View file

@ -4,7 +4,7 @@ Test with `pytest -n12 test/amd/`
`DEV=AMD:LLVM pytest -n12 test/amd/`
* dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1`
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=MOCK{KFD|PCI|USB}+AMD`
* generate.py -- extract assembly format + instruction pseudocode from AMD XML + PDF
* test/mockgpu/amd/pcode.py -- pseudocode to UOp transformation
* sqtt.py -- SQTT parser
@ -20,18 +20,18 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same
tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.
`DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`DEV=MOCKKFD+AMD pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
`DEV=MOCKKFD+AMD:LLVM pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
The ops tests also pass, but they are very slow, so you should run them one at a time.
`SKIP_SLOW_TEST=1 DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 DEV=MOCKKFD+AMD pytest -n=12 test/backend/test_ops.py`
`SKIP_SLOW_TEST=1 DEV=MOCKKFD+AMD:LLVM pytest -n=12 test/backend/test_ops.py`
When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware
If a test is failing with `DEV=AMD MOCKGPU=1` it's because an instruction is emulated incorrectly.
You can test without `MOCKGPU=1` to test on real hardware, if it works on real hardware there's a bug in the emulator.
If a test is failing with `DEV=MOCKKFD+AMD` it's because an instruction is emulated incorrectly.
You can test with just `DEV=AMD` to test on real hardware, if it works on real hardware there's a bug in the emulator.
IMPORTANT: if a test is failing in the emulator, it's an instruction bug. Use DEBUG=7, get the instructions, and debug.
Currently, only RDNA3 is well supported, but when finished, this will support RDNA3+RDNA4+CDNA in ~3000 lines.

View file

@ -11,8 +11,8 @@ libc = ctypes.CDLL(ctypes.util.find_library("c"))
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
libc.mmap.restype = ctypes.c_void_p
_amd_iface = DEV.target("AMD").interface
drivers = [NVDriver(), AMDriver() if _amd_iface == "PCI" else (AMUSBDriver() if _amd_iface == "USB" else AMDDriver())]
drivers = [cls() for t in DEV.value if (cls:={"MOCKPCI+AMD": AMDriver, "MOCKKFD+AMD": AMDDriver, "MOCKUSB+AMD": AMUSBDriver,
"MOCKNVK+NV": NVDriver}.get(f"{t.interface}+{t.device}"))]
tracked_fds = {}
original_memoryview = builtins.memoryview

View file

@ -12,7 +12,7 @@ import numpy as np
np.set_printoptions(linewidth=160)
from tinygrad import Tensor, Device, GlobalCounters, TinyJit
from tinygrad.nn import Conv2d
from tinygrad.helpers import colorize_float, getenv, CI
from tinygrad.helpers import colorize_float, getenv, CI, DEV
IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
@ -113,7 +113,7 @@ def helper_test_conv(bs, in_chans, out_chans, kernel_size, img_size_y, img_size_
helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:{kernel_size}", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))
@unittest.skipIf(getenv("BIG") == 0, "no big tests")
@unittest.skipIf(getenv("MOCKGPU"), "no MOCKGPUs")
@unittest.skipIf(DEV.interface.startswith("MOCK"), "no MOCKGPUs")
class TestBigSpeed(unittest.TestCase):
def test_add(self):
def f(a, b): return a+b
@ -134,7 +134,7 @@ class TestBigSpeed(unittest.TestCase):
def test_matvec_16384_4096(self): helper_test_matvec('matvec_16384_4096', 16384, 4096)
@unittest.skipIf(getenv("BIG") == 1, "only big tests")
@unittest.skipIf(getenv("MOCKGPU"), "no MOCKGPUs")
@unittest.skipIf(DEV.interface.startswith("MOCK"), "no MOCKGPUs")
class TestSpeed(unittest.TestCase):
def test_sub(self):
def f(a, b): return a-b

View file

@ -1,7 +1,7 @@
from tinygrad.helpers import getenv
from tinygrad.helpers import DEV
import unittest, importlib
@unittest.skipUnless(getenv("MOCKGPU"), 'Testing mockgpu')
@unittest.skipUnless(DEV.interface.startswith("MOCK"), 'Testing mockgpu')
class TestMockGPU(unittest.TestCase):
# https://github.com/tinygrad/tinygrad/pull/7627
def test_import_typing_extensions(self):

View file

@ -1,13 +1,14 @@
from typing_extensions import Callable
import hashlib, random, unittest
from tinygrad import Tensor, Device, getenv, dtypes
from tinygrad import Tensor, Device, dtypes
from tinygrad.helpers import DEV
from test.helpers import slow
from tinygrad.device import is_dtype_supported
from tinygrad.uop.ops import UOp
from tinygrad.engine.jit import TinyJit
@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV", "crashes in NV CI")
class TestHashing(unittest.TestCase):
def _python_hash_1mb(self, data:bytes):
chunks = [data[i:i+4096] for i in range(0, len(data), 4096)]
@ -21,7 +22,7 @@ class TestHashing(unittest.TestCase):
self.assertEqual(bytes(out.data()), expected)
@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV", "crashes in NV CI")
class TestKeccak(unittest.TestCase):
def setUp(self) -> None: random.seed(1337)

View file

@ -394,9 +394,8 @@ class CUDARenderer(CStyleLanguage):
def __init__(self, target:Target, use_nvcc=False):
super().__init__(target)
from tinygrad.runtime.support.compiler_cuda import NVRTCCompiler, NVCCCompiler
from tinygrad.runtime.support.hcq import MOCKGPU
dev, arch = target.device, target.arch
self.compiler = (NVCCCompiler if use_nvcc else NVRTCCompiler)(arch, ptx=bool(MOCKGPU) or dev == "CUDA", cache_key=dev.lower())
iface, dev, arch = target.interface, target.device, target.arch
self.compiler = (NVCCCompiler if use_nvcc else NVRTCCompiler)(arch, ptx=iface.startswith("MOCK") or dev == "CUDA", cache_key=dev.lower())
self.tensor_cores = tc.get_cuda(arch)
# language options

View file

@ -145,8 +145,7 @@ class PTXRenderer(Renderer):
def __init__(self, target:Target):
super().__init__(target)
from tinygrad.runtime.support.compiler_cuda import NVPTXCompiler, PTXCompiler
from tinygrad.runtime.support.hcq import MOCKGPU
self.compiler = (PTXCompiler if bool(MOCKGPU) or target.device == "CUDA" else NVPTXCompiler)(target.arch)
self.compiler = (PTXCompiler if target.interface.startswith("MOCK") or target.device == "CUDA" else NVPTXCompiler)(target.arch)
self.tensor_cores = PTXRenderer.tc_sm80 if (ver:=int(target.arch[3:])) >= 80 else tc.cuda_sm75 if ver >= 75 else []
# language options

View file

@ -325,5 +325,5 @@ class HCQGraph(MultiGraphRunner):
if new_call.src[0].op is Ops.COPY:
# MOCKGPU is not supported, since it can't execute commands in parallel
is_xfer = len(set(type(d) for d in all_devs)) == 1 and hasattr(alc:=all_devs[0].allocator, '_transfer') and alc.supports_transfer
return is_xfer or (all_devs[0].hw_copy_queue_t is not None and not getenv("MOCKGPU"))
return is_xfer or (all_devs[0].hw_copy_queue_t is not None and not getattr(all_devs[0], 'iface', None).__class__.__name__.startswith("MOCK"))
return _unwrap_beam(new_call.src[0]).op in (Ops.SINK, Ops.PROGRAM)

View file

@ -941,13 +941,15 @@ class USBIface(PCIIface):
def sleep(self, timeout): pass
def mock_iface(iface):
  """Create an empty subclass of `iface` whose class name is the original name prefixed with "MOCK"."""
  mock_name = f"MOCK{iface.__name__}"
  return type(mock_name, (iface,), {})
class AMDDevice(HCQCompiled):
def is_am(self) -> bool: return isinstance(self.iface, (PCIIface, USBIface))
def is_usb(self) -> bool: return isinstance(self.iface, USBIface)
def __init__(self, device:str=""):
self.device_id = int(device.split(":")[1]) if ":" in device else 0
self.iface = self._select_iface(KFDIface, PCIIface, USBIface)
self.iface = self._select_iface(KFDIface, PCIIface, USBIface, mock_iface(KFDIface), mock_iface(PCIIface), mock_iface(USBIface))
self.target:tuple[int, ...] = ((trgt:=self.iface.props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
self.arch = "gfx%d%x%x" % self.target
if self.target < (9,4,2) or self.target >= (13,0,0): raise RuntimeError(f"Unsupported arch: {self.arch}")

View file

@ -1,6 +1,6 @@
from __future__ import annotations
import ctypes, functools
from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing
from tinygrad.helpers import DEBUG, DEV, getenv, mv_address, suppress_finalizing
from tinygrad.device import Compiled, BufferSpec, LRUAllocator
from tinygrad.renderer.cstyle import CUDARenderer, NVCCRenderer
from tinygrad.renderer.ptx import PTXRenderer
@ -8,7 +8,7 @@ from tinygrad.runtime.autogen import cuda
from tinygrad.runtime.support.compiler_cuda import pretty_ptx
from tinygrad.runtime.support.c import init_c_struct_t, init_c_var
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported
if (MOCKGPU:=DEV.target("CUDA").interface == "MOCK"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported
def check(status):
if status != 0:

View file

@ -4,7 +4,7 @@ assert sys.platform != 'win32'
from typing import cast
from dataclasses import dataclass
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, hcq_filter_visible_devices, hcq_profile
from tinygrad.uop.ops import sint
from tinygrad.device import Compiled, BufferSpec
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, hi32, lo32, PROFILE, ContextVar, VIZ, ProfileEvent
@ -240,7 +240,7 @@ class NVVideoQueue(NVCommandQueue):
class NVArgsState(CLikeArgsState):
def __init__(self, buf:HCQBuffer, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
if MOCKGPU: prg.cbuf_0[80:82] = [len(bufs), len(vals)]
if isinstance(prg.dev.iface, MOCKNVKIface): prg.cbuf_0[80:82] = [len(bufs), len(vals)]
super().__init__(buf, prg, bufs, vals=vals, prefix=prg.cbuf_0 or None)
class NVProgram(HCQProgram):
@ -251,14 +251,14 @@ class NVProgram(HCQProgram):
if (NAK:=isinstance(dev.renderer, NAKRenderer)):
image, self.cbuf_0 = memoryview(bytearray(lib[ctypes.sizeof(info:=mesa.struct_nak_shader_info.from_buffer_copy(lib)):])), []
self.regs_usage, self.shmem_usage, self.lcmem_usage = info.num_gprs, round_up(info.cs.smem_size, 128), round_up(info.slm_size, 16)
elif MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
elif isinstance(dev.iface, MOCKNVKIface): image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)
# NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
self.lib_gpu = self.dev.allocator.alloc(round_up((prog_sz:=image.nbytes), 0x1000) + 0x1000, buf_spec:=BufferSpec(nolru=True))
prog_addr = self.lib_gpu.va_addr
if not NAK:
# For MOCKGPU, the lib is PTX code, so some values are emulated.
self.regs_usage, self.shmem_usage, self.lcmem_usage, cbuf0_size = 0, 0x400, 0x240, 0 if not MOCKGPU else 0x160
self.regs_usage, self.shmem_usage, self.lcmem_usage, cbuf0_size = 0, 0x400, 0x240, 0x160 if isinstance(dev.iface, MOCKNVKIface) else 0
for sh in sections: # pylint: disable=possibly-used-before-assignment
if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
if sh.name == f".text.{self.name}": prog_addr, prog_sz = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size
@ -472,7 +472,8 @@ class NVKIface:
def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, cpu_addr=None, **kwargs) -> HCQBuffer:
# Uncached memory is "system". Use huge pages only for gpu memory.
page_size = mmap.PAGESIZE if uncached or host else ((2 << 20) if size >= (8 << 20) else (mmap.PAGESIZE if MOCKGPU else 4 << 10))
page_size = mmap.PAGESIZE if uncached or host else ((2 << 20) if size >= (8 << 20) else (mmap.PAGESIZE if isinstance(self, MOCKNVKIface) else
4 << 10))
size = round_up(size, page_size)
va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access) if (alloced:=cpu_addr is None) else cpu_addr
@ -575,12 +576,14 @@ class PCIIface(PCIIfaceBase):
for _ in self.dev_impl.gsp.stat_q.read_resp(): pass
if self.dev_impl.is_err_state: raise RuntimeError("Device fault detected")
# Subclass of NVKIface that adds no members of its own; it exists as a distinct type so that
# isinstance(..., MOCKNVKIface) can tell the mock interface apart from a plain NVKIface.
class MOCKNVKIface(NVKIface): pass
class NVDevice(HCQCompiled[NVSignal]):
def is_nvd(self) -> bool: return isinstance(self.iface, PCIIface)
def __init__(self, device:str=""):
self.device_id = int(device.split(":")[1]) if ":" in device else 0
self.iface = self._select_iface(NVKIface, PCIIface)
self.iface = self._select_iface(NVKIface, PCIIface, MOCKNVKIface)
device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.iface.gpu_instance, hClientShare=self.iface.root,
vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_OPTIONAL_MULTIPLE_VASPACES)

View file

@ -56,7 +56,7 @@ class FileIOInterface:
@staticmethod
def eventfd(initval, flags=None): return FileIOInterface(fd=os.eventfd(initval, flags)) # type: ignore[attr-defined]
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockFileIOInterface as FileIOInterface # noqa: F401 # pylint: disable=unused-import
if DEV.interface.startswith("MOCK"): from test.mockgpu.mockgpu import MockFileIOInterface as FileIOInterface # noqa: F401 # pylint: disable=unused-import
# **************** for HCQ Compatible Devices ****************
@ -491,6 +491,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
f"{k}={v} is deprecated, use DEV={replace(DEV.target(type(self).__name__[:-6]), interface=v)} instead"
t = DEV.target(dev:=type(self).__name__[:-6])
filtered = select_by_name(ifaces, lambda i: i.__name__[:-5], t.interface, f"{dev} has no interface {t.interface!r}")
filtered = [i for i in filtered if t.interface.startswith("MOCK") or not i.__name__[:-5].startswith("MOCK")] # never fallback to mock ifaces
return select_first_inited([functools.partial(cast(Callable, iface), self, self.device_id) for iface in filtered],
f"No interface for {dev}:{self.device_id} is available")

View file

@ -1,7 +1,7 @@
import ctypes, struct, dataclasses, array, itertools, time
from typing import Sequence
from tinygrad.runtime.autogen import libusb
from tinygrad.helpers import DEBUG, to_mv, round_up, OSX, getenv, ceildiv
from tinygrad.helpers import DEBUG, DEV, to_mv, round_up, OSX, getenv, ceildiv
from tinygrad.runtime.support.hcq import MMIOInterface
def alloc_cbuffer(sz:int) -> tuple[ctypes.Array, memoryview]:
  """Allocate a ctypes array of `sz` unsigned bytes and return it together with the `to_mv` view built from its address."""
  buf = (ctypes.c_ubyte * sz)()
  # the array object is returned alongside the view so the caller holds a reference to the backing storage
  return buf, to_mv(ctypes.addressof(buf), sz)
@ -449,4 +449,4 @@ class USBMMIOInterface(MMIOInterface):
_, acc_sz = self._acc_size(len(data) * struct.calcsize(self.fmt))
self.usb.pcie_mem_write(self.addr+off, [int.from_bytes(data[i:i+acc_sz], "little") for i in range(0, len(data), acc_sz)], acc_sz)
if getenv("MOCKGPU"): from test.mockgpu.usb import MockUSB3 as USB3 # type: ignore # noqa: F811
if DEV.interface.startswith("MOCK"): from test.mockgpu.usb import MockUSB3 as USB3 # type: ignore # noqa: F811