mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
MOCKGPU interfaces (#15796)
This commit is contained in:
parent
8da308573f
commit
6adf4c3cd9
29 changed files with 94 additions and 95 deletions
41
.github/workflows/test.yml
vendored
41
.github/workflows/test.yml
vendored
|
|
@ -643,8 +643,7 @@ jobs:
|
|||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 20
|
||||
env:
|
||||
DEV: AMD
|
||||
MOCKGPU: 1
|
||||
DEV: MOCKKFD+AMD
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v6
|
||||
|
|
@ -670,7 +669,7 @@ jobs:
|
|||
- name: Run AMD renderer tests
|
||||
run: python -m pytest -n=auto test/amd/ --durations 20
|
||||
- name: Run AMD renderer tests (AMD:LLVM)
|
||||
run: DEV=AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
|
||||
run: DEV=MOCKKFD+AMD:LLVM python -m pytest -n=auto test/amd/ --durations 20
|
||||
- name: Run SQTT profiling tests
|
||||
run: PROFILE=1 SQTT=1 python3 -m pytest -n=auto test/amd/test_sqtt_profiler.py
|
||||
- name: Run AMD emulated tests on NULL backend
|
||||
|
|
@ -679,20 +678,19 @@ jobs:
|
|||
run: |
|
||||
PYTHONPATH=. DEV=NULL::gfx1100 python extra/mmapeak/mmapeak.py
|
||||
PYTHONPATH=. DEV=NULL::gfx1201 python3 -m pytest -n=auto test/testextra/test_tk.py test/backend/test_asm_gemm.py
|
||||
- name: Run matmul on MOCKGPU
|
||||
- name: Run matmul on MOCKKFD
|
||||
run: |
|
||||
PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_asm_matmul.py
|
||||
PYTHONPATH="." DEV=AMD MOCKGPU=1 N=256 python3 extra/gemm/amd_copy_matmul.py
|
||||
PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_asm_matmul.py
|
||||
PYTHONPATH="." DEV=MOCKKFD+AMD N=256 python3 extra/gemm/amd_copy_matmul.py
|
||||
- name: Run LLVM test
|
||||
run: DEV=AMD:LLVM python test/device/test_amd_llvm.py
|
||||
run: DEV=MOCKKFD+AMD:LLVM python test/device/test_amd_llvm.py
|
||||
|
||||
testmockam:
|
||||
name: Linux (am)
|
||||
runs-on: ubuntu-24.04
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
DEV: PCI+AMD
|
||||
MOCKGPU: 1
|
||||
DEV: MOCKPCI+AMD
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v6
|
||||
|
|
@ -704,13 +702,13 @@ jobs:
|
|||
amd: 'true'
|
||||
- name: Run test_tiny on MOCKAM
|
||||
run: python test/test_tiny.py
|
||||
- name: Run test_tiny on MOCKAM USB
|
||||
run: GMMU=0 DEV=USB+AMD python test/test_tiny.py
|
||||
- name: Run test_hcq on MOCKAM
|
||||
- name: Run test_tiny on MOCKUSB
|
||||
run: GMMU=0 DEV=MOCKUSB+AMD python test/test_tiny.py
|
||||
- name: Run test_hcq on MOCKPCI
|
||||
run: python -m pytest test/device/test_hcq.py
|
||||
- name: Run disk copy tests on MOCKAM
|
||||
- name: Run disk copy tests on MOCKPCI
|
||||
run: python -m pytest test/unit/test_disk_tensor.py -k test_copy_from_disk
|
||||
- name: Run test_tiny on MOCKAM Remote
|
||||
- name: Run test_tiny on MOCKPCI Remote
|
||||
run: |
|
||||
python extra/remote/serve.py 6667 &
|
||||
sleep 2
|
||||
|
|
@ -728,8 +726,7 @@ jobs:
|
|||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 15
|
||||
env:
|
||||
DEV: AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
|
||||
MOCKGPU: 1
|
||||
DEV: MOCKKFD+AMD:${{ matrix.backend == 'amdllvm' && 'LLVM' || '' }}:${{ matrix.arch }}
|
||||
SKIP_SLOW_TEST: 1
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
|
|
@ -764,7 +761,6 @@ jobs:
|
|||
runs-on: ubuntu-22.04
|
||||
timeout-minutes: 20
|
||||
env:
|
||||
MOCKGPU: 1
|
||||
FORWARD_ONLY: 1
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
|
|
@ -777,7 +773,7 @@ jobs:
|
|||
cuda: 'true'
|
||||
ocelot: 'true'
|
||||
- name: Set env
|
||||
run: printf "${{ matrix.backend == 'ptx' && 'DEV=CUDA:PTX' || matrix.backend == 'nv' && 'DEV=NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
|
||||
run: printf "${{ matrix.backend == 'ptx' && 'DEV=MOCK+CUDA:PTX' || matrix.backend == 'nv' && 'DEV=MOCKNVK+NV\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
|
||||
- name: Check Device.DEFAULT and print some source
|
||||
run: |
|
||||
python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
|
||||
|
|
@ -862,22 +858,19 @@ jobs:
|
|||
run: DEV=METAL TRANSCENDENTAL=2 python -m pytest -n=auto test/backend/test_ops.py::TestOps::test_sin test/backend/test_ops.py::TestOps::test_cos test/backend/test_ops.py::TestOps::test_tan test/backend/test_ops.py::TestOps::test_exp test/backend/test_ops.py::TestOps::test_log --durations=20
|
||||
- name: Run pytest (amd)
|
||||
env:
|
||||
MOCKGPU: 1
|
||||
DEV: AMD
|
||||
DEV: MOCKKFD+AMD
|
||||
FORWARD_ONLY: 1
|
||||
run: |
|
||||
python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
|
||||
- name: Run pytest (amd with llvm backend)
|
||||
env:
|
||||
MOCKGPU: 1
|
||||
DEV: "AMD:LLVM"
|
||||
DEV: "MOCKKFD+AMD:LLVM"
|
||||
FORWARD_ONLY: 1
|
||||
run: |
|
||||
python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
|
||||
- name: Run pytest (ptx)
|
||||
env:
|
||||
MOCKGPU: 1
|
||||
DEV: "NV:PTX"
|
||||
DEV: "MOCKNVK+NV:PTX"
|
||||
FORWARD_ONLY: 1
|
||||
# TODO: failing due to library loading error
|
||||
CAPTURE_PROCESS_REPLAY: 0
|
||||
|
|
|
|||
|
|
@ -1,9 +1,9 @@
|
|||
# tinygrad allows you to write kernels at many different abstractions levels.
|
||||
# This is for RDNA3, but if you don't have one you can run with the emulator
|
||||
# PYTHONPATH="." MOCKGPU=1 DEV=AMD
|
||||
# PYTHONPATH="." DEV=MOCKPCI+AMD
|
||||
|
||||
from tinygrad import Tensor, Context, GlobalCounters, UOp, Device
|
||||
from tinygrad.helpers import DEBUG, getenv
|
||||
from tinygrad.helpers import DEV, DEBUG, getenv
|
||||
from tinygrad.uop.ops import AxisType, KernelInfo, Ops
|
||||
from tinygrad.dtype import AddrSpace, dtypes
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import *
|
||||
|
|
@ -16,7 +16,7 @@ def eval_harness(name, tensor, fxn, check=None):
|
|||
print(f"computed in {GlobalCounters.time_sum_s*1000:.2f} ms, {(a.nbytes()/1e9)/GlobalCounters.time_sum_s:.2f} GB/s")
|
||||
return out
|
||||
|
||||
SZ = 256*1024 if getenv("MOCKGPU") else 1024*1024*1024
|
||||
SZ = 256*1024 if DEV.interface.startswith("MOCK") else 1024*1024*1024
|
||||
|
||||
def example_2_hip(a:Tensor, correct):
|
||||
GLOBALS = 1024
|
||||
|
|
|
|||
|
|
@ -37,8 +37,7 @@ dev.synchronize()
|
|||
'''
|
||||
|
||||
env = os.environ.copy()
|
||||
env["AMD"] = "1"
|
||||
env["MOCKGPU"] = "1"
|
||||
env["DEV"] = "MOCKKFD+AMD"
|
||||
env["HCQDEV_WAIT_TIMEOUT_MS"] = "10000"
|
||||
|
||||
st = time.perf_counter()
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Tests for SQTT encoder: verifies the emulator produces correct SQTT traces for known kernels.
|
||||
|
||||
Run with: DEV=AMD MOCKGPU=1 python -m pytest test/amd/test_sqtt_encoder.py -v
|
||||
Run with: DEV=MOCKKFD+AMD python -m pytest test/amd/test_sqtt_encoder.py -v
|
||||
"""
|
||||
import ctypes, unittest
|
||||
from tinygrad.helpers import Context
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import unittest
|
||||
from tinygrad import Tensor, Device, dtypes, Context
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.helpers import getenv, system
|
||||
from tinygrad.helpers import getenv, system, DEV
|
||||
from extra.gemm.cdna_asm_gemm import asm_gemm
|
||||
from test.helpers import needs_second_gpu
|
||||
from examples.mlperf.models.flat_llama import FP8_DTYPE
|
||||
|
|
@ -131,7 +131,7 @@ class TestGemmLlama(unittest.TestCase):
|
|||
dtype = dtypes.bfloat16
|
||||
|
||||
def setUp(self):
|
||||
if not is_cdna4() or getenv("MOCKGPU"):
|
||||
if not is_cdna4() or DEV.interface.startswith("MOCK"):
|
||||
self.skipTest("very slow on non mi350x")
|
||||
|
||||
def test_empty(self): asm_gemm(Tensor.empty(N:=getenv("N", 4096), N, dtype=self.dtype), Tensor.empty(N, N, dtype=self.dtype)).realize()
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import unittest, operator, math
|
||||
from tinygrad import Context, Tensor, dtypes, Device
|
||||
from tinygrad.dtype import DType, truncate, fp8_to_float
|
||||
from tinygrad.helpers import CI, EMULATED_DTYPES, getenv
|
||||
from tinygrad.helpers import CI, EMULATED_DTYPES, DEV, getenv
|
||||
from tinygrad.tensor import _to_np_dtype
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.runtime.ops_python import from_storage_scalar
|
||||
|
|
@ -32,7 +32,8 @@ unary_operations = [(Tensor.exp, np.exp), (Tensor.log, np.log), (Tensor.sin, np.
|
|||
#binary_operations.append(operator.truediv)
|
||||
|
||||
# TODO: CI CUDA segfaults on sin, WEBGPU and NIR sines are not precise enough for large numbers
|
||||
if (getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}) or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer):
|
||||
if ((DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"})
|
||||
or Device.DEFAULT == "WEBGPU" or isinstance(Device[Device.DEFAULT].renderer, NIRRenderer)):
|
||||
unary_operations.remove((Tensor.sin, np.sin))
|
||||
unary_operations.remove((Tensor.cos, np.cos))
|
||||
|
||||
|
|
|
|||
|
|
@ -27,10 +27,10 @@ import numpy as np
|
|||
import torch
|
||||
from tinygrad import Tensor, dtypes, nn
|
||||
from tinygrad.device import Device
|
||||
from tinygrad.helpers import getenv
|
||||
from tinygrad.helpers import DEV
|
||||
from tinygrad.renderer.nir import NIRRenderer
|
||||
|
||||
MOCKGPU = getenv("MOCKGPU")
|
||||
MOCKGPU = DEV.interface.startswith("MOCK")
|
||||
|
||||
class TestNaNEdgeCases(unittest.TestCase):
|
||||
# we don't need more of these. it's unclear if torch's behavior is desired here
|
||||
|
|
|
|||
|
|
@ -3,12 +3,12 @@ import unittest
|
|||
import torch
|
||||
import numpy as np
|
||||
|
||||
from tinygrad.helpers import getenv, CI
|
||||
from tinygrad.helpers import CI, DEV
|
||||
from tinygrad.tensor import Tensor
|
||||
from tinygrad.device import Device
|
||||
from tinygrad.dtype import _from_torch_dtype, _to_torch_dtype
|
||||
|
||||
MOCKGPU = getenv("MOCKGPU")
|
||||
MOCKGPU = DEV.interface.startswith("MOCK")
|
||||
|
||||
@unittest.skipIf(Device.DEFAULT not in ["METAL", "CUDA"] or MOCKGPU, f"no support on {Device.DEFAULT}")
|
||||
class TestInterop(unittest.TestCase):
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ from tinygrad.tensor import Tensor
|
|||
from tinygrad.engine.jit import TinyJit, JitError, GraphRunner, MultiGraphRunner, graph_class
|
||||
from tinygrad.engine.realize import CompiledRunner, BufferCopy, BufferXfer
|
||||
from tinygrad.device import Device
|
||||
from tinygrad.helpers import Context, JIT, GlobalCounters, getenv
|
||||
from tinygrad.helpers import Context, JIT, DEV, GlobalCounters
|
||||
from tinygrad.dtype import dtypes
|
||||
from extra.models.unet import ResBlock
|
||||
|
||||
|
|
@ -812,7 +812,7 @@ class TestJitGraphSplit(unittest.TestCase):
|
|||
hcqgraph=[self.ji_graph(6)])
|
||||
|
||||
@unittest.skip("this fails if you don't have SDMA or are using AMD_DISABLE_SDMA=1")
|
||||
@unittest.skipIf(getenv("MOCKGPU"), "MockGPU does not support parallel copies")
|
||||
@unittest.skipIf(DEV.interface.startswith("MOCK"), "MockGPU does not support parallel copies")
|
||||
def test_jit_multidev_copy(self):
|
||||
if Device.DEFAULT in {"CPU"}: raise unittest.SkipTest("CPU/LLVM is not a valid default device for this test (zero-copies)")
|
||||
|
||||
|
|
|
|||
|
|
@ -7,12 +7,12 @@ from tinygrad.uop.ops import UOp, Ops, GroupOp, AxisType
|
|||
from tinygrad.device import Device, Buffer, is_dtype_supported
|
||||
from tinygrad.tensor import Tensor, _to_np_dtype
|
||||
from tinygrad.engine.realize import run_schedule, CompiledRunner, get_program
|
||||
from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, getenv
|
||||
from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT, DEV
|
||||
from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
from tinygrad.renderer.cstyle import CUDARenderer
|
||||
from test.helpers import replace_opts
|
||||
MOCKGPU = getenv("MOCKGPU")
|
||||
MOCKGPU = DEV.interface.startswith("MOCK")
|
||||
|
||||
from tinygrad.uop.ops import print_uops # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
|
|
|
|||
|
|
@ -94,7 +94,7 @@ def prepare_test_op(low, high, shps, vals, forward_only=False):
|
|||
class TestOps(unittest.TestCase):
|
||||
|
||||
def helper_test_exception(self, shps, torch_fxn, tinygrad_fxn=None, expected=None, forward_only=False, exact=False, vals=None, low=-1.5, high=1.5):
|
||||
if getenv("MOCKGPU") and Device.DEFAULT == "NV": self.skipTest('helper_test_exception fails in CI CUDA')
|
||||
if DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV": self.skipTest('helper_test_exception fails in CI CUDA')
|
||||
ts, tst = prepare_test_op(low, high, shps, vals, forward_only)
|
||||
if tinygrad_fxn is None:
|
||||
tinygrad_fxn = torch_fxn
|
||||
|
|
@ -877,7 +877,7 @@ class TestOps(unittest.TestCase):
|
|||
helper_test_op([(45,65)], lambda x: x.sin())
|
||||
helper_test_op([()], lambda x: x.sin())
|
||||
# works on real CUDA but not CI
|
||||
if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
|
||||
if not ((DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
|
||||
helper_test_op(None, lambda x: x.sin(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
|
||||
helper_test_op(None, lambda x: x.sin(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
|
||||
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
|
||||
|
|
@ -886,7 +886,7 @@ class TestOps(unittest.TestCase):
|
|||
def test_cos(self):
|
||||
helper_test_op([(45,65)], lambda x: x.cos())
|
||||
helper_test_op([()], lambda x: x.cos())
|
||||
if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
|
||||
if not ((DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
|
||||
helper_test_op(None, lambda x: x.cos(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
|
||||
helper_test_op(None, lambda x: x.cos(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
|
||||
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
|
||||
|
|
@ -897,7 +897,7 @@ class TestOps(unittest.TestCase):
|
|||
helper_test_op([(45,65)], lambda x: x.tan(), low=-1.5, high=1.5)
|
||||
helper_test_op([(45,65)], lambda x: x.tan(), low=-5, high=5)
|
||||
helper_test_op([()], lambda x: x.tan())
|
||||
if not ((getenv("MOCKGPU") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
|
||||
if not ((DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV") or Device.DEFAULT == "WEBGPU"):
|
||||
helper_test_op(None, lambda x: x.tan(), vals=[[math.nan, math.inf, -math.inf, 0.0]])
|
||||
helper_test_op(None, lambda x: x.tan(), vals=[[1e1, 1e2, 1e3, 1e4, 1e5, 1e6, -1e1, -1e2, -1e3, -1e4, -1e5, -1e6]],
|
||||
atol=3e-3, rtol=3e-3, grad_atol=3e-3, grad_rtol=3e-3)
|
||||
|
|
@ -3310,7 +3310,7 @@ class TestOps(unittest.TestCase):
|
|||
helper_test_op([(32,10)], lambda x: x.masked_fill((x>0.1).detach(), -math.inf))
|
||||
helper_test_op([(32,10)], lambda x: x.masked_fill((x<0.1).detach(), -math.inf))
|
||||
|
||||
@unittest.skipIf((getenv("MOCKGPU") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
|
||||
@unittest.skipIf((DEV.interface.startswith("MOCK") or Device.DEFAULT == "PYTHON"), "very slow on MOCKGPU because reduce does not fold")
|
||||
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "webgpu runtime issue")
|
||||
@unittest.skipIf(Device.DEFAULT == "QCOM", "QCOM fails with: Resource deadlock avoided")
|
||||
def test_masked_select(self):
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
import unittest, struct, contextlib, statistics, gc
|
||||
from tinygrad import Device, Tensor, dtypes, TinyJit
|
||||
from tinygrad.helpers import CI, getenv, Context, ProfileRangeEvent, cpu_profile, cpu_events, ProfilePointEvent, dedup
|
||||
from tinygrad.helpers import CI, DEV, Context, ProfileRangeEvent, cpu_profile, cpu_events, ProfilePointEvent, dedup
|
||||
from tinygrad.device import Buffer, BufferSpec, Compiled, ProfileDeviceEvent, ProfileGraphEvent
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled
|
||||
from tinygrad.engine.realize import get_runner
|
||||
|
||||
MOCKGPU = getenv("MOCKGPU")
|
||||
MOCKGPU = DEV.interface.startswith("MOCK")
|
||||
def _dev_base(d):
|
||||
p = d.split(":")
|
||||
return p[0] if len(p) < 2 or not p[1].isdigit() else f"{p[0]}:{p[1]}"
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import unittest
|
||||
from tinygrad import Device, dtypes, Tensor
|
||||
from tinygrad.device import Buffer
|
||||
from tinygrad.helpers import Context, getenv
|
||||
from tinygrad.helpers import Context, DEV
|
||||
from test.helpers import needs_second_gpu
|
||||
|
||||
@unittest.skipUnless(hasattr(Device[Device.DEFAULT].allocator, "_offset"), "subbuffer not supported")
|
||||
|
|
@ -42,7 +42,7 @@ class TestSubBuffer(unittest.TestCase):
|
|||
assert out == [102, 103]
|
||||
|
||||
@needs_second_gpu
|
||||
@unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"} or getenv("MOCKGPU"), "only NV, AMD, CUDA")
|
||||
@unittest.skipIf(Device.DEFAULT not in {"CUDA", "NV", "AMD"} or DEV.interface.startswith("MOCK"), "only NV, AMD, CUDA")
|
||||
def test_subbuffer_transfer(self):
|
||||
t = Tensor.arange(0, 10, dtype=dtypes.uint8).realize()
|
||||
vt = t[2:5].contiguous().realize()
|
||||
|
|
|
|||
|
|
@ -14,7 +14,7 @@ settings.load_profile("my_profile")
|
|||
|
||||
class TestTranscendentalMath(unittest.TestCase):
|
||||
@unittest.skipUnless(is_dtype_supported(dtypes.float64), f"no float64 on {Device.DEFAULT}")
|
||||
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
|
||||
@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
|
||||
@given(ht.float64, strat.sampled_from([(Tensor.exp, np.exp), (Tensor.log, np.log), (Tensor.sin, np.sin)]))
|
||||
def test_float64(self, x, op):
|
||||
if op[0] == Tensor.sin:
|
||||
|
|
@ -25,7 +25,7 @@ class TestTranscendentalMath(unittest.TestCase):
|
|||
op[1](np.array([x], dtype=_to_np_dtype(dtypes.float64))),
|
||||
atol=3e-2, rtol=1e-5) # sin can have bigger atol for very big x
|
||||
|
||||
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
|
||||
@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}, "crashed")
|
||||
@given(ht.float32, strat.sampled_from([(Tensor.exp, np.exp),(Tensor.log, np.log)] +
|
||||
([(Tensor.sin, np.sin)] if is_dtype_supported(dtypes.ulong) else [])))
|
||||
def test_float32(self, x, op):
|
||||
|
|
@ -66,7 +66,7 @@ class TestFromFuzzer(unittest.TestCase):
|
|||
if not is_dtype_supported(dtype): return
|
||||
if dtype == dtypes.float64:
|
||||
# crashes in CI CUDA
|
||||
if getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}: return
|
||||
if DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}: return
|
||||
def _test_value(n: float, unit: float=1.0):
|
||||
next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
|
||||
ulp = next_float - 1.0
|
||||
|
|
@ -88,7 +88,7 @@ class TestFromFuzzer(unittest.TestCase):
|
|||
if not is_dtype_supported(dtype): return
|
||||
if dtype == dtypes.float64:
|
||||
# crashes in CI CUDA
|
||||
if getenv("MOCKGPU") and Device.DEFAULT in {"NV", "CUDA"}: return
|
||||
if DEV.interface.startswith("MOCK") and Device.DEFAULT in {"NV", "CUDA"}: return
|
||||
def _test_value(n: float, unit: float=1.0):
|
||||
next_float = np.nextafter(1.0, 2.0, dtype=_to_np_dtype(dtype))
|
||||
ulp = next_float - 1.0
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
import unittest, ctypes, struct, os, random, numpy as np, time
|
||||
from tinygrad import Device, Tensor, dtypes
|
||||
from tinygrad.helpers import getenv, mv_address, DEBUG, DEV
|
||||
from tinygrad.helpers import mv_address, DEBUG, DEV
|
||||
from test.helpers import slow, replace_opts
|
||||
from tinygrad.device import Buffer, BufferSpec
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQBuffer
|
||||
|
|
@ -10,7 +10,7 @@ from tinygrad.engine.realize import get_runner, CompiledRunner, get_program
|
|||
from tinygrad.codegen.opt import Opt, OptOps
|
||||
from tinygrad import Variable
|
||||
|
||||
MOCKGPU = getenv("MOCKGPU")
|
||||
MOCKGPU = DEV.interface.startswith("MOCK")
|
||||
|
||||
@unittest.skipUnless(issubclass(type(Device[Device.DEFAULT]), HCQCompiled), "HCQ device required to run")
|
||||
class TestHCQ(unittest.TestCase):
|
||||
|
|
@ -76,7 +76,7 @@ class TestHCQ(unittest.TestCase):
|
|||
TestHCQ.d0.timeline_signal.wait(TestHCQ.d0.timeline_value)
|
||||
TestHCQ.d0.timeline_value += 1
|
||||
|
||||
@unittest.skipIf(Device.DEFAULT in {"CPU"} or (DEV.interface == "PCI" and DEV.device == "AMD"), "Can't handle async update on CPU/MOCKAM device")
|
||||
@unittest.skipIf(Device.DEFAULT == "CPU" or (DEV.interface == "MOCKPCI" and DEV.device == "AMD"), "Can't handle async update on CPU/MOCKPCI device")
|
||||
def test_wait_late_set(self):
|
||||
for queue_type in [TestHCQ.d0.hw_compute_queue_t, TestHCQ.d0.hw_copy_queue_t]:
|
||||
if queue_type is None: continue
|
||||
|
|
@ -575,7 +575,7 @@ class TestHCQ(unittest.TestCase):
|
|||
|
||||
np.testing.assert_equal(cpu_buffer.numpy(), local_buf.numpy(), "failed")
|
||||
|
||||
@unittest.skipUnless(MOCKGPU and not (DEV.device == "AMD" and DEV.interface == "PCI"), "Emulate this on MOCKGPU to check the path in CI")
|
||||
@unittest.skipUnless(MOCKGPU and not (DEV.device == "AMD" and DEV.interface == "MOCKPCI"), "Emulate this on MOCKGPU to check the path in CI")
|
||||
def test_on_device_hang(self):
|
||||
if not hasattr(self.d0, 'on_device_hang'): self.skipTest("device does not have on_device_hang")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
# ruff: noqa: F405
|
||||
import unittest, subprocess, os
|
||||
from tinygrad.helpers import DEV
|
||||
from tinygrad.runtime.autogen.amd.rdna3.ins import * # noqa: F403
|
||||
from tinygrad.renderer.amd.dsl import s, v, Inst, NULL
|
||||
|
||||
|
|
@ -27,7 +28,7 @@ _ILLEGAL_INST_ASM = ".text\n.globl test\n.p2align 8\n.type test,@function\ntest:
|
|||
".rodata\n.p2align 6\n.amdhsa_kernel test\n.amdhsa_next_free_vgpr 8\n.amdhsa_next_free_sgpr 8\n" \
|
||||
".amdhsa_wavefront_size32 1\n.amdhsa_user_sgpr_kernarg_segment_ptr 1\n.amdhsa_kernarg_size 8\n.end_amdhsa_kernel"
|
||||
|
||||
@unittest.skipIf(os.environ.get("AMD") != "1" or os.environ.get("MOCKGPU") == "1", "AMD with AM driver required")
|
||||
@unittest.skipIf(DEV.device != "AMD" or not DEV.interface.startswith("MOCK"), "AMD with AM driver required")
|
||||
class TestAMFaultRecovery(unittest.TestCase):
|
||||
def _run_kernel(self, insts: list[Inst]) -> subprocess.CompletedProcess: return _run_asm(assemble_kernel(insts))
|
||||
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ Test with `pytest -n12 test/amd/`
|
|||
`DEV=AMD:LLVM pytest -n12 test/amd/`
|
||||
|
||||
* dsl.py -- helpers for the autogen instruction classes in `__init__.py`. should be standalone with init
|
||||
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=AMD MOCKGPU=1`
|
||||
* test/mockgpu/amd/emu.py -- an emulator for RDNA that runs in tinygrad with `DEV=MOCK{KFD|PCI|USB}+AMD`
|
||||
* generate.py -- extract assembly format + instruction pseudocode from AMD XML + PDF
|
||||
* test/mockgpu/amd/pcode.py -- pseudocode to UOp transformation
|
||||
* sqtt.py -- SQTT parser
|
||||
|
|
@ -20,18 +20,18 @@ test_llvm.py tests asm/disasm on the LLVM tests, confirming it behaves the same
|
|||
|
||||
tinygrad's dtype tests should pass with and without LLVM. they run in about 12 seconds.
|
||||
|
||||
`DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`DEV=MOCKKFD+AMD pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
`DEV=MOCKKFD+AMD:LLVM pytest -n=12 test/backend/test_dtype_alu.py test/backend/test_dtype.py`
|
||||
|
||||
The ops tests also pass, but they are very slow, so you should run them one at a time.
|
||||
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 DEV=AMD:LLVM MOCKGPU=1 pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 DEV=MOCKKFD+AMD pytest -n=12 test/backend/test_ops.py`
|
||||
`SKIP_SLOW_TEST=1 DEV=MOCKKFD+AMD:LLVM pytest -n=12 test/backend/test_ops.py`
|
||||
|
||||
When something is caught by main tinygrad tests, a local regression test should be added to `test/amd`.
|
||||
While working with tinygrad, you can dump the assembly with `DEBUG=7`. These tests all pass on real hardware
|
||||
If a test is failing with `DEV=AMD MOCKGPU=1` it's because an instruction is emulated incorrectly.
|
||||
You can test without `MOCKGPU=1` to test on real hardware, if it works on real hardware there's a bug in the emulator.
|
||||
If a test is failing with `DEV=MOCKKFD+AMD` it's because an instruction is emulated incorrectly.
|
||||
You can test with just `DEV=AMD` to test on real hardware, if it works on real hardware there's a bug in the emulator.
|
||||
IMPORTANT: if a test is failing in the emulator, it's an instruction bug. Use DEBUG=7, get the instructions, and debug.
|
||||
|
||||
Currently, only RDNA3 is well supported, but when finished, this will support RDNA3+RDNA4+CDNA in ~3000 lines.
|
||||
|
|
|
|||
|
|
@ -11,8 +11,8 @@ libc = ctypes.CDLL(ctypes.util.find_library("c"))
|
|||
libc.mmap.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_int, ctypes.c_int, ctypes.c_int, ctypes.c_long]
|
||||
libc.mmap.restype = ctypes.c_void_p
|
||||
|
||||
_amd_iface = DEV.target("AMD").interface
|
||||
drivers = [NVDriver(), AMDriver() if _amd_iface == "PCI" else (AMUSBDriver() if _amd_iface == "USB" else AMDDriver())]
|
||||
drivers = [cls() for t in DEV.value if (cls:={"MOCKPCI+AMD": AMDriver, "MOCKKFD+AMD": AMDDriver, "MOCKUSB+AMD": AMUSBDriver,
|
||||
"MOCKNVK+NV": NVDriver}.get(f"{t.interface}+{t.device}"))]
|
||||
tracked_fds = {}
|
||||
|
||||
original_memoryview = builtins.memoryview
|
||||
|
|
|
|||
|
|
@ -12,7 +12,7 @@ import numpy as np
|
|||
np.set_printoptions(linewidth=160)
|
||||
from tinygrad import Tensor, Device, GlobalCounters, TinyJit
|
||||
from tinygrad.nn import Conv2d
|
||||
from tinygrad.helpers import colorize_float, getenv, CI
|
||||
from tinygrad.helpers import colorize_float, getenv, CI, DEV
|
||||
|
||||
IN_CHANS = [int(x) for x in getenv("IN_CHANS", "4,16,64").split(",")]
|
||||
|
||||
|
|
@ -113,7 +113,7 @@ def helper_test_conv(bs, in_chans, out_chans, kernel_size, img_size_y, img_size_
|
|||
helper_test_generic(f"conv bs:{bs:3d} chans:{in_chans:3d} -> {out_chans:3d} k:{kernel_size}", f1, (torch_dat,), TinyJit(f2), (tiny_dat,))
|
||||
|
||||
@unittest.skipIf(getenv("BIG") == 0, "no big tests")
|
||||
@unittest.skipIf(getenv("MOCKGPU"), "no MOCKGPUs")
|
||||
@unittest.skipIf(DEV.interface.startswith("MOCK"), "no MOCKGPUs")
|
||||
class TestBigSpeed(unittest.TestCase):
|
||||
def test_add(self):
|
||||
def f(a, b): return a+b
|
||||
|
|
@ -134,7 +134,7 @@ class TestBigSpeed(unittest.TestCase):
|
|||
def test_matvec_16384_4096(self): helper_test_matvec('matvec_16384_4096', 16384, 4096)
|
||||
|
||||
@unittest.skipIf(getenv("BIG") == 1, "only big tests")
|
||||
@unittest.skipIf(getenv("MOCKGPU"), "no MOCKGPUs")
|
||||
@unittest.skipIf(DEV.interface.startswith("MOCK"), "no MOCKGPUs")
|
||||
class TestSpeed(unittest.TestCase):
|
||||
def test_sub(self):
|
||||
def f(a, b): return a-b
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
from tinygrad.helpers import getenv
|
||||
from tinygrad.helpers import DEV
|
||||
import unittest, importlib
|
||||
|
||||
@unittest.skipUnless(getenv("MOCKGPU"), 'Testing mockgpu')
|
||||
@unittest.skipUnless(DEV.interface.startswith("MOCK"), 'Testing mockgpu')
|
||||
class TestMockGPU(unittest.TestCase):
|
||||
# https://github.com/tinygrad/tinygrad/pull/7627
|
||||
def test_import_typing_extensions(self):
|
||||
|
|
|
|||
|
|
@ -1,13 +1,14 @@
|
|||
from typing_extensions import Callable
|
||||
import hashlib, random, unittest
|
||||
from tinygrad import Tensor, Device, getenv, dtypes
|
||||
from tinygrad import Tensor, Device, dtypes
|
||||
from tinygrad.helpers import DEV
|
||||
from test.helpers import slow
|
||||
from tinygrad.device import is_dtype_supported
|
||||
from tinygrad.uop.ops import UOp
|
||||
from tinygrad.engine.jit import TinyJit
|
||||
|
||||
@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
|
||||
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
|
||||
@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV", "crashes in NV CI")
|
||||
class TestHashing(unittest.TestCase):
|
||||
def _python_hash_1mb(self, data:bytes):
|
||||
chunks = [data[i:i+4096] for i in range(0, len(data), 4096)]
|
||||
|
|
@ -21,7 +22,7 @@ class TestHashing(unittest.TestCase):
|
|||
self.assertEqual(bytes(out.data()), expected)
|
||||
|
||||
@unittest.skipUnless(is_dtype_supported(dtypes.uint8) and is_dtype_supported(dtypes.uint64), "Device must support uint8 and uint64")
|
||||
@unittest.skipIf(getenv("MOCKGPU") and Device.DEFAULT == "NV", "crashes in NV CI")
|
||||
@unittest.skipIf(DEV.interface.startswith("MOCK") and Device.DEFAULT == "NV", "crashes in NV CI")
|
||||
class TestKeccak(unittest.TestCase):
|
||||
def setUp(self) -> None: random.seed(1337)
|
||||
|
||||
|
|
|
|||
|
|
@ -394,9 +394,8 @@ class CUDARenderer(CStyleLanguage):
|
|||
def __init__(self, target:Target, use_nvcc=False):
|
||||
super().__init__(target)
|
||||
from tinygrad.runtime.support.compiler_cuda import NVRTCCompiler, NVCCCompiler
|
||||
from tinygrad.runtime.support.hcq import MOCKGPU
|
||||
dev, arch = target.device, target.arch
|
||||
self.compiler = (NVCCCompiler if use_nvcc else NVRTCCompiler)(arch, ptx=bool(MOCKGPU) or dev == "CUDA", cache_key=dev.lower())
|
||||
iface, dev, arch = target.interface, target.device, target.arch
|
||||
self.compiler = (NVCCCompiler if use_nvcc else NVRTCCompiler)(arch, ptx=iface.startswith("MOCK") or dev == "CUDA", cache_key=dev.lower())
|
||||
self.tensor_cores = tc.get_cuda(arch)
|
||||
|
||||
# language options
|
||||
|
|
|
|||
|
|
@ -145,8 +145,7 @@ class PTXRenderer(Renderer):
|
|||
def __init__(self, target:Target):
|
||||
super().__init__(target)
|
||||
from tinygrad.runtime.support.compiler_cuda import NVPTXCompiler, PTXCompiler
|
||||
from tinygrad.runtime.support.hcq import MOCKGPU
|
||||
self.compiler = (PTXCompiler if bool(MOCKGPU) or target.device == "CUDA" else NVPTXCompiler)(target.arch)
|
||||
self.compiler = (PTXCompiler if target.interface.startswith("MOCK") or target.device == "CUDA" else NVPTXCompiler)(target.arch)
|
||||
self.tensor_cores = PTXRenderer.tc_sm80 if (ver:=int(target.arch[3:])) >= 80 else tc.cuda_sm75 if ver >= 75 else []
|
||||
|
||||
# language options
|
||||
|
|
|
|||
|
|
@ -325,5 +325,5 @@ class HCQGraph(MultiGraphRunner):
|
|||
if new_call.src[0].op is Ops.COPY:
|
||||
# MOCKGPU is not supported, since it can't execute commands in parallel
|
||||
is_xfer = len(set(type(d) for d in all_devs)) == 1 and hasattr(alc:=all_devs[0].allocator, '_transfer') and alc.supports_transfer
|
||||
return is_xfer or (all_devs[0].hw_copy_queue_t is not None and not getenv("MOCKGPU"))
|
||||
return is_xfer or (all_devs[0].hw_copy_queue_t is not None and not getattr(all_devs[0], 'iface', None).__class__.__name__.startswith("MOCK"))
|
||||
return _unwrap_beam(new_call.src[0]).op in (Ops.SINK, Ops.PROGRAM)
|
||||
|
|
|
|||
|
|
@ -941,13 +941,15 @@ class USBIface(PCIIface):
|
|||
|
||||
def sleep(self, timeout): pass
|
||||
|
||||
def mock_iface(iface): return type(f"MOCK{iface.__name__}", (iface,), {})
|
||||
|
||||
class AMDDevice(HCQCompiled):
|
||||
# True when the selected interface (self.iface) is an instance of PCIIface or USBIface.
def is_am(self) -> bool: return isinstance(self.iface, (PCIIface, USBIface))
|
||||
# True when the selected interface (self.iface) is an instance of USBIface.
def is_usb(self) -> bool: return isinstance(self.iface, USBIface)
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
self.iface = self._select_iface(KFDIface, PCIIface, USBIface)
|
||||
self.iface = self._select_iface(KFDIface, PCIIface, USBIface, mock_iface(KFDIface), mock_iface(PCIIface), mock_iface(USBIface))
|
||||
self.target:tuple[int, ...] = ((trgt:=self.iface.props['gfx_target_version']) // 10000, (trgt // 100) % 100, trgt % 100)
|
||||
self.arch = "gfx%d%x%x" % self.target
|
||||
if self.target < (9,4,2) or self.target >= (13,0,0): raise RuntimeError(f"Unsupported arch: {self.arch}")
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
from __future__ import annotations
|
||||
import ctypes, functools
|
||||
from tinygrad.helpers import DEBUG, getenv, mv_address, suppress_finalizing
|
||||
from tinygrad.helpers import DEBUG, DEV, getenv, mv_address, suppress_finalizing
|
||||
from tinygrad.device import Compiled, BufferSpec, LRUAllocator
|
||||
from tinygrad.renderer.cstyle import CUDARenderer, NVCCRenderer
|
||||
from tinygrad.renderer.ptx import PTXRenderer
|
||||
|
|
@ -8,7 +8,7 @@ from tinygrad.runtime.autogen import cuda
|
|||
from tinygrad.runtime.support.compiler_cuda import pretty_ptx
|
||||
from tinygrad.runtime.support.c import init_c_struct_t, init_c_var
|
||||
if getenv("IOCTL"): import extra.nv_gpu_driver.nv_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported
|
||||
if (MOCKGPU:=DEV.target("CUDA").interface == "MOCK"): from test.mockgpu.cuda import cuda # type: ignore # pylint: disable=reimported
|
||||
|
||||
def check(status):
|
||||
if status != 0:
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ assert sys.platform != 'win32'
|
|||
from typing import cast
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQProgram, HCQSignal, BumpAllocator
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, MOCKGPU, hcq_filter_visible_devices, hcq_profile
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface, FileIOInterface, hcq_filter_visible_devices, hcq_profile
|
||||
from tinygrad.uop.ops import sint
|
||||
from tinygrad.device import Compiled, BufferSpec
|
||||
from tinygrad.helpers import getenv, mv_address, round_up, data64, data64_le, prod, OSX, hi32, lo32, PROFILE, ContextVar, VIZ, ProfileEvent
|
||||
|
|
@ -240,7 +240,7 @@ class NVVideoQueue(NVCommandQueue):
|
|||
|
||||
class NVArgsState(CLikeArgsState):
|
||||
def __init__(self, buf:HCQBuffer, prg:NVProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
|
||||
if MOCKGPU: prg.cbuf_0[80:82] = [len(bufs), len(vals)]
|
||||
if isinstance(prg.dev.iface, MOCKNVKIface): prg.cbuf_0[80:82] = [len(bufs), len(vals)]
|
||||
super().__init__(buf, prg, bufs, vals=vals, prefix=prg.cbuf_0 or None)
|
||||
|
||||
class NVProgram(HCQProgram):
|
||||
|
|
@ -251,14 +251,14 @@ class NVProgram(HCQProgram):
|
|||
if (NAK:=isinstance(dev.renderer, NAKRenderer)):
|
||||
image, self.cbuf_0 = memoryview(bytearray(lib[ctypes.sizeof(info:=mesa.struct_nak_shader_info.from_buffer_copy(lib)):])), []
|
||||
self.regs_usage, self.shmem_usage, self.lcmem_usage = info.num_gprs, round_up(info.cs.smem_size, 128), round_up(info.slm_size, 16)
|
||||
elif MOCKGPU: image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
|
||||
elif isinstance(dev.iface, MOCKNVKIface): image, sections, relocs = memoryview(bytearray(lib) + b'\x00' * (4 - len(lib)%4)).cast("I"), [], [] # type: ignore
|
||||
else: image, sections, relocs = elf_loader(self.lib, force_section_align=128)
|
||||
# NOTE: Ensure at least 4KB of space after the program to mitigate prefetch memory faults.
|
||||
self.lib_gpu = self.dev.allocator.alloc(round_up((prog_sz:=image.nbytes), 0x1000) + 0x1000, buf_spec:=BufferSpec(nolru=True))
|
||||
prog_addr = self.lib_gpu.va_addr
|
||||
if not NAK:
|
||||
# For MOCKGPU, the lib is PTX code, so some values are emulated.
|
||||
self.regs_usage, self.shmem_usage, self.lcmem_usage, cbuf0_size = 0, 0x400, 0x240, 0 if not MOCKGPU else 0x160
|
||||
self.regs_usage, self.shmem_usage, self.lcmem_usage, cbuf0_size = 0, 0x400, 0x240, 0x160 if isinstance(dev.iface, MOCKNVKIface) else 0
|
||||
for sh in sections: # pylint: disable=possibly-used-before-assignment
|
||||
if sh.name == f".nv.shared.{self.name}": self.shmem_usage = round_up(0x400 + sh.header.sh_size, 128)
|
||||
if sh.name == f".text.{self.name}": prog_addr, prog_sz = self.lib_gpu.va_addr+sh.header.sh_addr, sh.header.sh_size
|
||||
|
|
@ -472,7 +472,8 @@ class NVKIface:
|
|||
|
||||
def alloc(self, size:int, host=False, uncached=False, cpu_access=False, contiguous=False, map_flags=0, cpu_addr=None, **kwargs) -> HCQBuffer:
|
||||
# Uncached memory is "system". Use huge pages only for gpu memory.
|
||||
page_size = mmap.PAGESIZE if uncached or host else ((2 << 20) if size >= (8 << 20) else (mmap.PAGESIZE if MOCKGPU else 4 << 10))
|
||||
page_size = mmap.PAGESIZE if uncached or host else ((2 << 20) if size >= (8 << 20) else (mmap.PAGESIZE if isinstance(self, MOCKNVKIface) else
|
||||
4 << 10))
|
||||
size = round_up(size, page_size)
|
||||
va_addr = self._alloc_gpu_vaddr(size, alignment=page_size, force_low=cpu_access) if (alloced:=cpu_addr is None) else cpu_addr
|
||||
|
||||
|
|
@ -575,12 +576,14 @@ class PCIIface(PCIIfaceBase):
|
|||
for _ in self.dev_impl.gsp.stat_q.read_resp(): pass
|
||||
if self.dev_impl.is_err_state: raise RuntimeError("Device fault detected")
|
||||
|
||||
# Marker subclass of NVKIface that overrides nothing itself. Code in this file uses
# isinstance(..., MOCKNVKIface) checks to pick mock-specific paths (PTX image handling,
# emulated cbuf0 size, allocation page size).
class MOCKNVKIface(NVKIface): pass
|
||||
|
||||
class NVDevice(HCQCompiled[NVSignal]):
|
||||
def is_nvd(self) -> bool: return isinstance(self.iface, PCIIface)
|
||||
|
||||
def __init__(self, device:str=""):
|
||||
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
||||
self.iface = self._select_iface(NVKIface, PCIIface)
|
||||
self.iface = self._select_iface(NVKIface, PCIIface, MOCKNVKIface)
|
||||
|
||||
device_params = nv_gpu.NV0080_ALLOC_PARAMETERS(deviceId=self.iface.gpu_instance, hClientShare=self.iface.root,
|
||||
vaMode=nv_gpu.NV_DEVICE_ALLOCATION_VAMODE_OPTIONAL_MULTIPLE_VASPACES)
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ class FileIOInterface:
|
|||
@staticmethod
|
||||
def eventfd(initval, flags=None): return FileIOInterface(fd=os.eventfd(initval, flags)) # type: ignore[attr-defined]
|
||||
|
||||
if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.mockgpu import MockFileIOInterface as FileIOInterface # noqa: F401 # pylint: disable=unused-import
|
||||
if DEV.interface.startswith("MOCK"): from test.mockgpu.mockgpu import MockFileIOInterface as FileIOInterface # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
# **************** for HCQ Compatible Devices ****************
|
||||
|
||||
|
|
@ -491,6 +491,7 @@ class HCQCompiled(Compiled, Generic[SignalType]):
|
|||
f"{k}={v} is deprecated, use DEV={replace(DEV.target(type(self).__name__[:-6]), interface=v)} instead"
|
||||
t = DEV.target(dev:=type(self).__name__[:-6])
|
||||
filtered = select_by_name(ifaces, lambda i: i.__name__[:-5], t.interface, f"{dev} has no interface {t.interface!r}")
|
||||
filtered = [i for i in filtered if t.interface.startswith("MOCK") or not i.__name__[:-5].startswith("MOCK")] # never fallback to mock ifaces
|
||||
return select_first_inited([functools.partial(cast(Callable, iface), self, self.device_id) for iface in filtered],
|
||||
f"No interface for {dev}:{self.device_id} is available")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import ctypes, struct, dataclasses, array, itertools, time
|
||||
from typing import Sequence
|
||||
from tinygrad.runtime.autogen import libusb
|
||||
from tinygrad.helpers import DEBUG, to_mv, round_up, OSX, getenv, ceildiv
|
||||
from tinygrad.helpers import DEBUG, DEV, to_mv, round_up, OSX, getenv, ceildiv
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface
|
||||
|
||||
def alloc_cbuffer(sz:int) -> tuple[ctypes.Array, memoryview]:
  """Allocate a ctypes array of `sz` unsigned bytes and return it with a view made by `to_mv`.

  Returns a `(array, view)` pair, where `view` is `to_mv` applied to the array's
  address and `sz`. The array is returned alongside the view so the caller can
  hold a reference to the backing storage.
  """
  cbuf = (ctypes.c_ubyte * sz)()
  view = to_mv(ctypes.addressof(cbuf), sz)
  return cbuf, view
|
||||
|
|
@ -449,4 +449,4 @@ class USBMMIOInterface(MMIOInterface):
|
|||
_, acc_sz = self._acc_size(len(data) * struct.calcsize(self.fmt))
|
||||
self.usb.pcie_mem_write(self.addr+off, [int.from_bytes(data[i:i+acc_sz], "little") for i in range(0, len(data), acc_sz)], acc_sz)
|
||||
|
||||
if getenv("MOCKGPU"): from test.mockgpu.usb import MockUSB3 as USB3 # type: ignore # noqa: F811
|
||||
if DEV.interface.startswith("MOCK"): from test.mockgpu.usb import MockUSB3 as USB3 # type: ignore # noqa: F811
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue