* start gpu

* progress

* fixes

* read correct

* libusb

* libusb works

* support asm24

* hmm

* one access file

* fix extra

* start AMBar

* works on am

* back to usb

* patch fw

* full fast write into a bar

* ugh, minus one gpus, next please

* mute libusb for now

* usb for asm24

* 63

* hmm

* ops

* rescan

* and gpu should be there

* enumerate them?

* usbgpu bus 4, 100% reliable (draft)

* lil

* works

* comments

* add DEBUG

* cleaner

* simplest

* Revert "simplest"

This reverts commit 1d00354c16.

* Revert "cleaner"

This reverts commit c5662de956.

* assert we find gpu

* that's simpler

* this back

* simpler?

* correct

* work

* nonsense

* works with more checks

* this works

* the 6s in the right place

* reliable now

* fix after reboot

* set config

* 1s timeouts

* close to fw loading

* streams

* usbhub works

* endpoints

* fix

* want to test tiny10

* move to tiny 10

* fix gpu

* ugly speed

* smth

* mostly broken, but signals and dmas

* do not reset gpu every time

* changes to run kernels

* ugh, not working

* t10

* pg and sc files

* some prog

* um?

* somehow it works

* patched for 24

* some tries

* minimal

* moving

* back to working

* so sloooooow

* move to controller

* usb.py rewrite

* rework

* cleaner 1

* cleaner 2

* cleaner 3

* new abstractions

* aft merge

* init controller

* cleaner 4

* cleaner 5

* patcher + tiny changes

* ignore that

* cleaner 6

* after rebase

* cleaner 7

* bring it back

* start linter war

* linter 2

* autogen was missing

* fix autogen

* typing

* better?

* mypy

* extra/legacy rename and cleaner

* shuffle

* better printing

* tiny changes and tests

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
nimlgen 2025-05-01 18:03:47 +03:00 committed by GitHub
commit 30bd6a619f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 2368 additions and 14 deletions

View file

@ -528,6 +528,19 @@ generate_webgpu() {
python3 -c "import tinygrad.runtime.autogen.webgpu"
}
# Generate ctypes bindings for libusb-1.0 from the system header into $BASE/libusb.py,
# then rewrite the stub loader so the shared library is resolved at import time.
generate_libusb() {
clang2py -k cdefstum \
/usr/include/libusb-1.0/libusb.h \
-o $BASE/libusb.py
# fixup is a helper defined elsewhere in this script
fixup $BASE/libusb.py
# the loader expression injected below calls os.getenv, so `os` has to be imported
sed -i "s\import ctypes\import ctypes, os\g" $BASE/libusb.py
sed -i "s/FIXME_STUB/libusb/g" "$BASE/libusb.py"
# remove the generated `libusb_le16_to_cpu = libusb_cpu_to_le16` alias line
sed -i "s/libusb_le16_to_cpu = libusb_cpu_to_le16//g" "$BASE/libusb.py"
# library lookup: LIBUSB_PATH wins, otherwise ctypes.util.find_library('usb-1.0'); the handle is None when neither resolves
# NOTE(review): the expression uses ctypes.util -- confirm the generated module imports it
sed -i "s/FunctionFactoryStub()/None if (lib_path:=os.getenv('LIBUSB_PATH', ctypes.util.find_library('usb-1.0'))) is None else ctypes.CDLL(lib_path)/g" "$BASE/libusb.py"
# smoke test: the generated module must at least import
python3 -c "import tinygrad.runtime.autogen.libusb"
}
if [ "$1" == "opencl" ]; then generate_opencl
elif [ "$1" == "hip" ]; then generate_hip
elif [ "$1" == "comgr" ]; then generate_comgr
@ -548,6 +561,7 @@ elif [ "$1" == "adreno" ]; then generate_adreno
elif [ "$1" == "pci" ]; then generate_pci
elif [ "$1" == "vfio" ]; then generate_vfio
elif [ "$1" == "webgpu" ]; then generate_webgpu
elif [ "$1" == "libusb" ]; then generate_libusb
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_nvrtc; generate_hsa; generate_kfd; generate_nv; generate_amd; generate_io_uring; generate_libc; generate_am; generate_webgpu
else echo "usage: $0 <type>"
fi

2
extra/usbgpu/.gitignore vendored Normal file
View file

@ -0,0 +1,2 @@
Software/
fw.zip

View file

@ -0,0 +1,88 @@
#!/usr/bin/env python3
import sys
import zlib
def patch(input_filepath, output_filepath, patches):
  """Apply byte patches to a firmware image and write it out with a refreshed footer.

  `patches` is an iterable of (offset, expected_bytes, new_bytes). Every patch is
  checked against the bytes currently in the image before it is applied. After
  all patches are in place the last five bytes are rewritten: an 8-bit sum of
  image[4:-6], followed by the little-endian CRC32 of the same range.

  Returns True once the output file is written, or False (writing nothing) as
  soon as a patch would extend past the end of the image.
  Raises ValueError when expected and replacement bytes differ in length, and
  AssertionError when the image does not hold the expected bytes.
  """
  with open(input_filepath, 'rb') as src:
    image = bytearray(src.read())

  for offset, expected_bytes, new_bytes in patches:
    if len(expected_bytes) != len(new_bytes):
      print(len(expected_bytes), len(new_bytes))
      raise ValueError("Expected bytes and new bytes must be the same length")

    stop = offset + len(new_bytes)
    if stop > len(image):
      return False

    current_bytes = image[offset:stop]
    assert bytes(current_bytes) == expected_bytes, f"Expected {expected_bytes} at offset {offset:x}, but got {current_bytes}"
    image[offset:stop] = new_bytes

  # the footer covers everything between the 4-byte header and the last 6 bytes
  covered = image[4:-6]
  image[-5] = sum(covered) & 0xff
  image[-4:] = zlib.crc32(covered).to_bytes(4, 'little')

  with open(output_filepath, 'wb') as dst:
    dst.write(image)
  return True
# Patch table consumed by patch(): each entry is (offset, expected_bytes, new_bytes).
# Offsets are written as <address> + <byte within instruction> + 4; the + 4 matches the
# 4 leading bytes that patch() also leaves out of the checksum range.
# Only the 0x2a0d entry is active; the commented entries are earlier experiments.
patches = [
# (0x3903 + 1 + 4, b'\x8a', b'\x8b'),
# (0x3cf9 + 1 + 4, b'\x8a', b'\x8b'), # this is the one which triggered...
(0x2a0d + 1 + 4, b'\x0a', b'\x05'), # write handle exit with code 5 (?)
# (0x40e1 + 4, b'\x90\x06\xe6\x04\xf0\x78\x0d\xe6\xfe\x24\x71\x12\x1b\x0b\x60\x0b\x74\x08', b'\x7f\x00\x12\x53\x21\x12\x1c\xfc\x74\x01\xf6\x90\x90\x94\x74\x10\xf0\x22')
# (0x29ad + 1 + 4, b'\x09', b'\x05'), # write handle exit with code 5 (?)
# (0x40ef + 0 + 4, b'\x60', b'\x70'), # jz -> jnz
# (0x40e1 + 0 + 4, b'\x90', b'\x22'), # jmp -> ret
# (0x40fa + 0 + 4, b'\x80', b'\x22'),
# (0x40e1 + 0 + 4, b'\x90\x06\xe6\x04\xf0', b'\x7f\x00\x02\x41\x7c'), # jmp -> ret
]
next_traphandler = 0
def add_traphandler(addr, sec):
  """Queue two patches that detour the code at `addr` through a counting trampoline.

  `sec` is the exact byte sequence currently at `addr`. The first patch replaces
  it with 0x02 + the big-endian trampoline address, padded to the original
  length with 0x22 bytes. The second patch fills an all-zero area at
  0x6000 + 0x20 * slot with a stub that increments the byte at 0x3000 + slot,
  replays `sec`, and ends with 0x02 + the address just past `sec`.
  Each call consumes the next slot.
  # NOTE(review): 0x02 / 0x22 are presumably the 8051 LJMP / RET opcodes -- confirm
  """
  global next_traphandler, patches
  slot = next_traphandler
  trap_addr = 0x6000 + slot * 0x20
  cntr_addr = 0x3000 + slot
  return_addr = addr + len(sec)

  detour = b'\x02' + trap_addr.to_bytes(2, 'big') + b'\x22' * (len(sec) - 3)
  bump_counter = b'\xc0\xe0\xc0\x82\xc0\x83\x90' + cntr_addr.to_bytes(2, 'big') + b'\xe0\x04\xf0\xd0\x83\xd0\x82\xd0\xe0'
  trampoline = bump_counter + sec + b'\x02' + return_addr.to_bytes(2, 'big')
  blank = b'\x00' * (21 + len(sec))  # the trampoline area must still be empty

  patches += [(addr + 4, sec, detour), (trap_addr + 4, blank, trampoline)]
  next_traphandler = slot + 1
# add_traphandler(0x0206, b'\xed\x54\x06') # fill_scsi_resp
# add_traphandler(0x40d9, b'\x78\x6a\xe6') # fill_scsi_to_usb_transport
# add_traphandler(0x4d44, b'\x78\x6a\xe6') # FUN_CODE_4d44
# add_traphandler(0x4784, b'\x78\x6a\xe6') # FUN_CODE_4784
# add_traphandler(0x3e81, b'\x90\xc5\x16') # FUN_CODE_3e81
# add_traphandler(0x32a5, b'\x78\x6a\xe6') # FUN_CODE_32a5
# add_traphandler(0x2a10, b'\x90\xc4\x51') # FUN_CODE_2a10
# add_traphandler(0x2608, b'\x12\x16\x87') # FUN_CODE_2608
# add_traphandler(0x0e78, b'\x90\xc8\x02') # main usb entry
# add_traphandler(0x102f, b'\x12\x18\x0d') # possible scsi entry parser
# add_traphandler(0x1198, b'\x12\x18\x0d') # close_to_scsi_parse_1_and_set_c47a_to_0xff caller to scsi
# add_traphandler(0x180d, b'\x90\x0a\x7d') # close_to_scsi_parse
# add_traphandler(0x1114, b'\x75\x37\x00') # entry into if ((DAT_EXTMEM_c802 >> 2 & 1) != 0) { in main usb entry
# add_traphandler(0x113a, b'\x90\x90\x00') # exit from scsi parse loop
# add_traphandler(0x117b, b'\xd0\x07\xd0\x06') # exit from main usb entry
# add_traphandler(0x2f81, b'\x90\x0a\x59') # main loop? 8
# add_traphandler(0xc7a7, b'\x90\x09\xfa') # call smth in write path 9
# add_traphandler(0x2fcb, b'\x90\x0a\x59') # if ((DAT_EXTMEM_0ae2 != 0) && (DAT_EXTMEM_0ae2 != 0x10)) {
# add_traphandler(0x2fc0, b'\x90\x0a\xe2') # submain loop 11
# add_traphandler(0x30be, b'\x90\x0a\x5a') # aft sub loop 12
# add_traphandler(0x3076, b'\x12\x03\x59') # call to call_wait_for_nvme??(); 13
# add_traphandler(0x30ad, b'\x12\x04\xe4') # call to call_wait_for_nvme??(); 14
# add_traphandler(0x2608, b'\x12\x16\x87') # FUN_CODE_2608
# add_traphandler(0x10ee, b'\x90\x04\x64') # iniside trap handler
# add_traphandler(0x10e0, b'\x90\xc8\x06') # iniside trap handler
# add_traphandler(0x4977, b'\x90\x0a\xa8') # waiter for nvme???
# Usage: <script> <input firmware> <output firmware>
# The call must not live inside an `assert` statement: `python -O` strips asserts,
# which would skip the patching altogether and exit successfully.
if patch(sys.argv[1], sys.argv[2], patches) is not True:
  raise AssertionError("patch() wrote no output: a patch offset lies past the end of the input image")

View file

@ -0,0 +1,22 @@
import array, time, ctypes, struct, random
from hexdump import hexdump
from tinygrad.runtime.support.usb import ASMController, WriteOp
from tinygrad.runtime.autogen import pci
from tinygrad.helpers import Timing
from tinygrad import Device
# Rough throughput check: time 64 scsi_write calls with a 4 KiB buffer, then 64 reads of 0x1000 bytes at 0xf000.
usb = ASMController()

# 4 KiB buffer filled with one random byte value
xxx = (ctypes.c_uint8 * 4096)()
dfg = random.randint(0, 255)
ctypes.memset(xxx, dfg, ctypes.sizeof(xxx))

print(dfg, usb.read(0xf000, 0x10))

with Timing():
  for _ in range(64):
    usb.scsi_write(xxx)

with Timing():
  for _ in range(64):
    usb.read(0xf000, 0x1000)

exit(0)

83
extra/usbgpu/patch.py Executable file
View file

@ -0,0 +1,83 @@
#!/usr/bin/env python3
import sys, os, zlib, struct
from hexdump import hexdump
from tinygrad.helpers import DEBUG, getenv, fetch
from tinygrad.runtime.support.usb import USB3
def patch(input_filepath, patches):
  """Return the firmware image at `input_filepath` with `patches` applied and its footer recomputed.

  Args:
    input_filepath: path of the unmodified firmware image.
    patches: iterable of (offset, expected_bytes, new_bytes); `expected_bytes` must match
      the image contents at `offset` and have the same length as `new_bytes`.

  Returns:
    bytearray holding the patched image. The last five bytes are rewritten: an 8-bit sum of
    image[4:-6], followed by the little-endian CRC32 of the same range.

  Raises:
    ValueError: a patch has mismatched lengths, does not fit inside the image, or the image
      does not contain the expected bytes. All checks are explicit raises (not asserts), so
      they stay active under `python -O` -- the result of this function gets flashed to a device.
  """
  with open(input_filepath, 'rb') as infile: data = bytearray(infile.read())

  for offset, expected_bytes, new_bytes in patches:
    if len(expected_bytes) != len(new_bytes):
      raise ValueError("Expected bytes and new bytes must be the same length")

    end = offset + len(new_bytes)
    # previously this returned False, which the caller then tried to slice like an image
    if offset < 0 or end > len(data):
      raise ValueError(f"Patch at offset {offset:x} does not fit into an image of {len(data)} bytes")

    current_bytes = data[offset:end]
    if bytes(current_bytes) != expected_bytes:
      raise ValueError(f"Expected {expected_bytes} at offset {offset:x}, but got {current_bytes}")
    data[offset:end] = new_bytes

  # the footer covers everything between the 4-byte header and the last 6 bytes
  covered = data[4:-6]
  data[-5] = sum(covered) & 0xff
  data[-4:] = zlib.crc32(covered).to_bytes(4, 'little')
  return data
# The stock firmware image is expected under Software/ next to this script.
path = os.path.dirname(os.path.abspath(__file__))
file_path = os.path.join(path, "Software/AS_USB4_240417_85_00_00.bin")
# First run: download the vendor archive with curl and extract only the firmware image from it.
# NOTE(review): the exit status of curl/unzip is not checked; a failed download surfaces later when patch() opens file_path.
if not os.path.exists(file_path):
  url = "https://web.archive.org/web/20250430124720/https://www.station-drivers.com/index.php/en/component/remository/func-download/6341/chk,3ef8b04704a18eb2fc57ff60382379ad/no_html,1/lang,en-gb/"
  os.system(f'curl -o "{path}/fw.zip" "{url}"')
  os.system(f'unzip -o "{path}/fw.zip" "Software/AS_USB4_240417_85_00_00.bin" -d "{path}"')
# Single patch: the byte at 0x2a0d + 1 (+ 4 leading image bytes) changes from 0x0a to 0x05.
patches = [(0x2a0d + 1 + 4, b'\x0a', b'\x05')]
patched_fw = patch(file_path, patches)
# USBDEV selects the controller as "<vendor hex>:<device hex>"; the default is 174C:2464.
vendor, device = [int(x, base=16) for x in getenv("USBDEV", "174C:2464").split(":")]
# Endpoint arguments follow USB3.__init__: data in 0x81, status in 0x83, data out 0x02, command out 0x04.
try: dev = USB3(vendor, device, 0x81, 0x83, 0x02, 0x04)
except RuntimeError as e:
  raise RuntimeError(f'{e}. You can set USBDEV environment variable to your device\'s vendor and device ID (e.g., USBDEV="174C:2464")') from e
# Two 128-byte configuration blocks sent to the controller before the firmware itself.
# config1 holds the ASCII strings "AAAABBBB0060", "tiny", "USB 3.2 PCIe TinyEnclosure" and "TinyEnclosure";
# the remaining bytes are 0xFF filler plus binary fields whose meaning is not documented here.
config1 = bytes([
0xFF, 0xFF, 0xFF, 0xFF, 0x41, 0x41, 0x41, 0x41, 0x42, 0x42, 0x42, 0x42, 0x30, 0x30, 0x36, 0x30,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x74, 0x69, 0x6E, 0x79, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x74, 0x69, 0x6E, 0x79,
0xFF, 0xFF, 0xFF, 0xFF, 0x55, 0x53, 0x42, 0x20, 0x33, 0x2E, 0x32, 0x20, 0x50, 0x43, 0x49, 0x65,
0x20, 0x54, 0x69, 0x6E, 0x79, 0x45, 0x6E, 0x63, 0x6C, 0x6F, 0x73, 0x75, 0x72, 0x65, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0x54, 0x69, 0x6E, 0x79, 0x45, 0x6E, 0x63, 0x6C, 0x6F, 0x73, 0x75, 0x72,
0x65, 0xFF, 0xFF, 0xFF, 0xD1, 0xAD, 0x01, 0x00, 0x00, 0x01, 0xCF, 0xFF, 0x02, 0xFF, 0x5A, 0x94])
# config2 holds the ASCII strings "Gopod Group Limited." and "USB4 NVMe SSD Pro Enclosure",
# followed by binary fields whose meaning is not documented here.
config2 = bytes([
0xFF, 0xFF, 0xFF, 0xFF, 0x47, 0x6F, 0x70, 0x6F, 0x64, 0x20, 0x47, 0x72, 0x6F, 0x75, 0x70, 0x20,
0x4C, 0x69, 0x6D, 0x69, 0x74, 0x65, 0x64, 0x2E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x55, 0x53, 0x42, 0x34,
0x20, 0x4E, 0x56, 0x4D, 0x65, 0x20, 0x53, 0x53, 0x44, 0x20, 0x50, 0x72, 0x6F, 0x20, 0x45, 0x6E,
0x63, 0x6C, 0x6F, 0x73, 0x75, 0x72, 0x65, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
0xFF, 0xFF, 0xFF, 0xFF, 0x8C, 0xBF, 0xFF, 0x97, 0xC1, 0xF3, 0xFF, 0xFF, 0x01, 0x2D, 0x66, 0xD6,
0x66, 0x06, 0x00, 0xC0, 0x87, 0x01, 0x5A, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xCA, 0x01, 0x66, 0xD6,
0xE3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x01, 0x00, 0xA5, 0x67])
# The patched image is sent in two pieces, split at offset 0xff00.
part1 = patched_fw[:0xff00]
part2 = patched_fw[0xff00:]
# config patch
# CDB 0xe1/0x50 with a trailing index byte: index 0 carries config1, index 1 carries config2
cdb = struct.pack('>BBB12x', 0xe1, 0x50, 0x0)
dev.send_batch(cdbs=[cdb], odata=[config1])
cdb = struct.pack('>BBB12x', 0xe1, 0x50, 0x1)
dev.send_batch(cdbs=[cdb], odata=[config2])
# CDB 0xe3 carries the payload length as a big-endian 32-bit value; the second byte is 0x50 for part1 and 0xd0 for part2
cdb = struct.pack('>BBI', 0xe3, 0x50, len(part1))
dev.send_batch(cdbs=[cdb], odata=[part1])
cdb = struct.pack('>BBI', 0xe3, 0xd0, len(part2))
dev.send_batch(cdbs=[cdb], odata=[part2])
# final CDB 0xe8/0x51 is sent without a data phase
# NOTE(review): presumably commits the new firmware / restarts the controller -- confirm
cdb = struct.pack('>BB13x', 0xe8, 0x51)
dev.send_batch(cdbs=[cdb])
print("done, you can disconnect the controller!")

67
extra/usbgpu/scan_pci.py Normal file
View file

@ -0,0 +1,67 @@
import array, time
from hexdump import hexdump
from tinygrad.runtime.support.usb import ASM24Controller
from tinygrad.runtime.autogen import pci
usb = ASM24Controller()
def print_cfg(bus, dev):
  """Read the first 256 bytes of config space of function 0 at bus/dev, hexdump them and return them as a bytearray."""
  dwords = [usb.pcie_cfg_req(off, bus=bus, dev=dev, fn=0, value=None, size=4) for off in range(0, 256, 4)]
  print(f"bus={bus}, dev={dev}")
  raw = bytearray(array.array('I', dwords))
  hexdump(raw)
  return raw
def rescan_bus(bus, gpu_bus):
  """Program the bridge at bus:0.0 (bus numbers, secondary bus reset, memory windows).

  The subordinate bus is set to `gpu_bus`, the secondary bus to bus+1 and the
  primary bus to max(0, bus-1). The secondary bus reset bit is then held for
  100 ms before bridge control is set to parity|SERR and the windows are written.
  """
  def cfg_write(reg, value, size):
    usb.pcie_cfg_req(reg, bus=bus, dev=0, fn=0, value=value, size=size)

  print(f"set PCI_SUBORDINATE_BUS bus={bus} to {gpu_bus}")
  bus_numbers = ((pci.PCI_SUBORDINATE_BUS, gpu_bus), (pci.PCI_SECONDARY_BUS, bus+1), (pci.PCI_PRIMARY_BUS, max(0, bus-1)))
  for reg, value in bus_numbers:
    cfg_write(reg, value, 1)

  print(f"rescan bus={bus}")
  cfg_write(pci.PCI_BRIDGE_CONTROL, pci.PCI_BRIDGE_CTL_BUS_RESET, 1)
  time.sleep(0.1)
  cfg_write(pci.PCI_BRIDGE_CONTROL, pci.PCI_BRIDGE_CTL_PARITY|pci.PCI_BRIDGE_CTL_SERR, 1)

  windows = ((pci.PCI_MEMORY_BASE, 0x1000), (pci.PCI_MEMORY_LIMIT, 0x2000),
             (pci.PCI_PREF_MEMORY_BASE, 0x2000), (pci.PCI_PREF_MEMORY_LIMIT, 0xffff))
  for reg, value in windows:
    cfg_write(reg, value, 2)
# Walk down from the root: dump a bridge's config space, then program it so the next bus becomes reachable.
print_cfg(0, 0)
rescan_bus(0, gpu_bus=4)
print_cfg(1, 0)
rescan_bus(1, gpu_bus=4)
# short pause before bus 2 is read
time.sleep(0.1)
print_cfg(2, 0)
def setup_bus(bus, gpu_bus):
  """Program the bridge at bus:0.0: bus numbers, bridge control, command register and memory windows.

  Unlike rescan_bus this does not wait between setting the secondary bus reset
  bit and rewriting bridge control, and it also enables IO, memory and bus
  mastering in the command register.
  """
  def cfg_write(reg, value, size):
    usb.pcie_cfg_req(reg, bus=bus, dev=0, fn=0, value=value, size=size)

  print(f"setup bus={bus}")
  byte_regs = (
    (pci.PCI_SUBORDINATE_BUS, gpu_bus),
    (pci.PCI_SECONDARY_BUS, bus+1),
    (pci.PCI_PRIMARY_BUS, max(0, bus-1)),
    (pci.PCI_BRIDGE_CONTROL, pci.PCI_BRIDGE_CTL_BUS_RESET),
    (pci.PCI_BRIDGE_CONTROL, pci.PCI_BRIDGE_CTL_PARITY|pci.PCI_BRIDGE_CTL_SERR),
    (pci.PCI_COMMAND, pci.PCI_COMMAND_IO | pci.PCI_COMMAND_MEMORY | pci.PCI_COMMAND_MASTER),
  )
  for reg, value in byte_regs:
    cfg_write(reg, value, 1)

  windows = ((pci.PCI_MEMORY_BASE, 0x1000), (pci.PCI_MEMORY_LIMIT, 0x2000),
             (pci.PCI_PREF_MEMORY_BASE, 0x2000), (pci.PCI_PREF_MEMORY_LIMIT, 0xffff))
  for reg, value in windows:
    cfg_write(reg, value, 2)
# Configure the last two bridges, then read the device on bus 4.
setup_bus(2, gpu_bus=4)
print_cfg(3, 0)
setup_bus(3, gpu_bus=4)
dmp = print_cfg(4, 0)
print(dmp[0:4])
# the first config dword is compared against bytes 02 10 80 74
# NOTE(review): that reads as little-endian vendor 0x1002 / device 0x7480 -- confirm the intended GPU
assert dmp[0:4] == b"\x02\x10\x80\x74", "GPU NOT FOUND!"
print("GPU FOUND!")

View file

@ -0,0 +1,39 @@
import unittest, time
from tinygrad.runtime.support.usb import ASM24Controller
class TestASMController(unittest.TestCase):
  """Checks of ASM24Controller read/write/scsi_write against a real controller."""

  @classmethod
  def setUpClass(cls):
    # one controller instance is shared by every test in this class
    cls.ctrl = ASM24Controller()

  def test_write_and_read(self):
    """Bytes written with write() come back unchanged from read()."""
    addr, message = 0xF000, b"hello!"
    self.ctrl.write(addr, message)
    self.assertEqual(self.ctrl.read(addr, len(message)), message)

  def test_scsi_write_and_read_from_f000(self):
    """A 4 KiB scsi_write payload is readable back at 0xF000."""
    block = bytes([0x5B]) * 4096
    self.ctrl.scsi_write(block, lba=0)
    self.assertEqual(self.ctrl.read(0xF000, len(block)), block)

  def test_scsi_write_speed_4k(self):
    """Report how long one 4 KiB scsi_write takes (no assertion)."""
    block = bytes([0x5A]) * 4096
    t0 = time.perf_counter()
    self.ctrl.scsi_write(block, lba=0)
    dur_ms = (time.perf_counter() - t0) * 1000
    print(f"scsi_write 4K took {dur_ms:.3f} ms")

  def test_read_speed_4k(self):
    """Report how long one 4 KiB read takes and check the data read."""
    block = bytes([0xA5]) * 4096
    self.ctrl.write(0xF000, block)
    t0 = time.perf_counter()
    got = self.ctrl.read(0xF000, 4096)
    dur_ms = (time.perf_counter() - t0) * 1000
    print(f"read 4K took {dur_ms:.3f} ms")
    self.assertEqual(got, block)
if __name__ == "__main__":
unittest.main()

13
test/mockgpu/usb.py Normal file
View file

@ -0,0 +1,13 @@
class MockUSB:
  """In-memory stand-in for a USB controller: every access is served from `mem`."""

  def __init__(self, mem):
    # mutable byte buffer that backs all reads and writes
    self.mem = mem

  def read(self, address, size):
    """Return `size` bytes starting at `address` as an immutable copy."""
    stop = address + size
    return bytes(self.mem[address:stop])

  def write(self, address, data):
    """Store `data` at `address`."""
    stop = address + len(data)
    self.mem[address:stop] = data

  def pcie_mem_req(self, address, value=None, size=1):
    """Read (value is None) or write a little-endian integer of `size` bytes at `address`."""
    window = slice(address, address + size)
    if value is not None:
      self.mem[window] = value.to_bytes(size, "little")
      return None
    return int.from_bytes(self.mem[window], "little")

View file

@ -1,6 +1,8 @@
import unittest, array, time
from tinygrad.helpers import mv_address
from tinygrad.runtime.support.hcq import MMIOInterface
from tinygrad.runtime.support.usb import USBMMIOInterface
from test.mockgpu.usb import MockUSB
class TestHCQIface(unittest.TestCase):
def setUp(self):
@ -51,5 +53,53 @@ class TestHCQIface(unittest.TestCase):
mvend = time.perf_counter()
print(f"speed: hcq {end - start:.6f}s vs plain mv {mvend - mvstart:.6f}s")
class TestUSBMMIOInterface(unittest.TestCase):
  """Exercise USBMMIOInterface indexing, slicing and views on top of MockUSB."""

  def setUp(self):
    self.size = 256
    self.buffer = bytearray(self.size)
    self.usb = MockUSB(self.buffer)
    self.mmio = USBMMIOInterface(self.usb, 0, self.size, fmt='B', pcimem=False)

  def _pcimem_pair(self):
    # fresh backing store plus a pcimem=True interface over it
    backing = MockUSB(bytearray(self.size))
    return backing, USBMMIOInterface(backing, 0, self.size, fmt='B', pcimem=True)

  def test_getitem_setitem_byte(self):
    self.mmio[1] = 0xAB
    # without pcimem a single-element read comes back as bytes
    self.assertEqual(self.mmio[1], bytes([0xAB]))
    self.assertEqual(self.usb.mem[1], 0xAB)

  def test_slice_getitem_setitem(self):
    expected = [1, 2, 3, 4]
    self.mmio[10:14] = expected
    got = self.mmio[10:14]
    self.assertIsInstance(got, bytes)
    self.assertEqual(list(got), expected)
    self.assertEqual(list(self.usb.mem[10:14]), expected)

  def test_view(self):
    self.mmio[0] = 5
    sub = self.mmio.view(offset=1, size=3)
    self.assertEqual(sub[0], bytes([self.usb.mem[1]]))
    sub[:] = [7, 8, 9]
    self.assertEqual(list(self.usb.mem[1:4]), [7, 8, 9])

    whole = self.mmio.view()
    self.assertEqual(len(whole), len(self.mmio))
    # writes through the parent are visible through the view
    self.mmio[2] = 0xFE
    self.assertEqual(whole[2], bytes([0xFE]))

  def test_pcimem_byte(self):
    backing, mmio_pci = self._pcimem_pair()
    mmio_pci[3] = 0x11
    # with pcimem a single-element read comes back as an int
    self.assertEqual(mmio_pci[3], 0x11)
    self.assertEqual(backing.mem[3], 0x11)

  def test_pcimem_slice(self):
    _, mmio_pci = self._pcimem_pair()
    expected = [2, 3, 4]
    mmio_pci[4:7] = expected
    got = mmio_pci[4:7]
    self.assertIsInstance(got, bytes)
    self.assertEqual(list(got), expected)
    self.assertEqual([mmio_pci[idx] for idx in range(4, 7)], expected)
if __name__ == "__main__":
unittest.main()

File diff suppressed because it is too large Load diff

View file

@ -15,7 +15,8 @@ from tinygrad.runtime.autogen.am import am
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
from tinygrad.runtime.support.amd import AMDRegBase, collect_registers, import_module
from tinygrad.runtime.support.amd import AMDRegBase, collect_registers, import_module, setup_pci_bars
from tinygrad.runtime.support.usb import ASM24Controller, USBMMIOInterface
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
@ -461,6 +462,8 @@ class AMDProgram(HCQProgram):
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
class AMDAllocator(HCQAllocator['AMDDevice']):
def __init__(self, dev:AMDDevice): super().__init__(dev, copy_bufs=getattr(dev.dev_iface, 'copy_bufs', None))
def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
@ -722,14 +725,17 @@ class PCIIface:
bar_info = FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2, fmt='Q'), self._map_pci_range(5, fmt='I'))
self.ip_versions = self.adev.ip_ver
self.ip_offsets = {hwip: tuple(instances[0]) for hwip,instances in self.adev.regs_offset.items()}
self._setup_adev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2, fmt='Q'), self._map_pci_range(5, fmt='I'))
self.doorbell_cpu_addr = dbell.addr
pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
def _setup_adev(self, name, vram:MMIOInterface, doorbell:MMIOInterface, mmio:MMIOInterface):
self.adev = AMDev(name, vram, doorbell, mmio)
self.ip_versions = self.adev.ip_ver
self.ip_offsets = {hwip: tuple(instances[0]) for hwip,instances in self.adev.regs_offset.items()}
gfxver = int(f"{self.adev.ip_ver[am.GC_HWIP][0]:02d}{self.adev.ip_ver[am.GC_HWIP][1]:02d}{self.adev.ip_ver[am.GC_HWIP][2]:02d}")
array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
@ -796,16 +802,50 @@ class PCIIface:
def device_fini(self): self.adev.fini()
class USBIface(PCIIface):
  """AMD device interface that reaches the GPU through an ASM24 USB controller instead of sysfs/VFIO.

  Reuses PCIIface._setup_adev for the AM driver setup, but does not call PCIIface.__init__.
  """
  def __init__(self, dev, dev_id):
    self.dev = dev
    self.usb = ASM24Controller()
    # assign BAR addresses over USB; the result maps BAR index -> (address, size)
    self.bars = setup_pci_bars(self.usb, gpu_bus=4, mem_base=0x40000000, pref_mem_base=0x10000000)

    # BAR 0 -> vram, BAR 2 -> doorbell, BAR 5 -> mmio (argument order of _setup_adev)
    self._setup_adev(f"usb:{dev_id}", USBMMIOInterface(self.usb, *self.bars[0], fmt='B'), USBMMIOInterface(self.usb, *self.bars[2], fmt='Q'),
      USBMMIOInterface(self.usb, *self.bars[5], fmt='I'))

    # special regions
    # One 4 KiB copy buffer: the GPU side maps system address 0x200000 (uncached, not snooped),
    # the host side accesses controller address 0xf000 with pcimem=False.
    # NOTE(review): presumably both refer to the same controller buffer -- confirm
    copy_vaddr = self.adev.mm.alloc_vaddr(size=0x1000, align=0x1000)
    self.copy_region = self.adev.mm.map_range(copy_vaddr, 0x1000, [(0x200000, 0x1000)], system=True, snooped=False, uncached=True)
    self.copy_bufs = [HCQBuffer(copy_vaddr, 0x1000, meta=AMAllocationMeta(self.dev, [self.dev], self.copy_region, has_cpu_mapping=False),
      view=USBMMIOInterface(self.usb, 0xf000, 0x1000, fmt='B', pcimem=False))]

  def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
    # `host` is accepted but not used here; size is rounded up to a multiple of 4 KiB.
    # cpu_access requests a contiguous allocation and gets a host view at BAR 0 + first physical address.
    am_mapping = self.adev.mm.valloc(size:=round_up(size, 4 << 10), uncached=uncached, contigous=cpu_access)
    return HCQBuffer(am_mapping.va_addr, size, meta=AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=False),
      view=USBMMIOInterface(self.usb, self.bars[0][0] + am_mapping.paddrs[0][0], size, fmt='B') if cpu_access else None)

  def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
    # rptr lives at gart.va_addr and wptr at gart.va_addr+0x10; doorbell_index is bound by the walrus inside each call
    if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
      self.adev.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
        doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0), pipe=0, queue=0)
    else:
      self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=gart.va_addr, wptr_addr=gart.va_addr+0x10,
        eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=(doorbell_index:=am.AMDGPU_NAVI10_DOORBELL_MEC_RING0), pipe=0, queue=0)

    # the ring view is fixed at 0x8000 bytes and all views go through the vram/doorbell interfaces
    return AMDQueueDesc(ring=self.adev.vram.view(ring.meta.mapping.paddrs[0][0], 0x8000, fmt='I'),
      doorbells=[self.adev.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
      read_ptrs=[self.adev.vram.view(gart.meta.mapping.paddrs[0][0], 8, fmt='Q')],
      write_ptrs=[self.adev.vram.view(gart.meta.mapping.paddrs[0][0]+0x10, 8, fmt='Q')])
class AMDDevice(HCQCompiled):
devices: ClassVar[list[HCQCompiled]] = []
signal_pages: ClassVar[list[HCQBuffer]] = []
signal_pool: ClassVar[list[HCQBuffer]] = []
def is_am(self) -> bool: return isinstance(self.dev_iface, PCIIface)
def is_am(self) -> bool: return isinstance(self.dev_iface, (PCIIface, USBIface))
def is_usb(self) -> bool: return isinstance(self.dev_iface, USBIface)
def _select_iface(self):
errs:str = ""
for iface_t in (KFDIface, PCIIface) if len(nm:=getenv("AMD_IFACE", "")) == 0 else (getattr(sys.modules[__name__], f"{nm}Iface"),):
for iface_t in (KFDIface, PCIIface, USBIface) if len(nm:=getenv("AMD_IFACE", "")) == 0 else (getattr(sys.modules[__name__], f"{nm}Iface"),):
try: return iface_t(self, self.device_id)
except Exception as e: errs += f"\n{iface_t.__name__}: {type(e).__name__}: {e}"
raise RuntimeError(f"Cannot find a usable interface for AMD:{self.device_id}:{errs}")
@ -844,15 +884,16 @@ class AMDDevice(HCQCompiled):
nbio_pad = (0,) if self.target[0] == 9 else ()
self.nbio = AMDIP('nbio' if self.target[0]<12 else 'nbif', nbio_ver, nbio_pad+self.dev_iface.ip_offsets[am.NBIF_HWIP])
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x8000 if self.is_usb() else 0x800000,
ctx_save_restore_size=wg_data_size + ctl_stack_size, eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
max_copy_size = 0x40000000 if self.dev_iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x800000)
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x8000 if self.is_usb() else 0x800000)
super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch),
AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size))
AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size),
kernargs_size=(8 << 10) if self.is_usb() else (16 << 20))
# Scratch setup
self.max_private_segment_size = 0

View file

@ -1,5 +1,5 @@
from __future__ import annotations
import ctypes, collections, time, dataclasses, functools, fcntl, os, hashlib
import ctypes, collections, time, dataclasses, functools, fcntl, os, hashlib, array
from tinygrad.helpers import mv_address, getenv, round_up, DEBUG, temp, fetch
from tinygrad.runtime.autogen.am import am, mp_11_0
from tinygrad.runtime.support.hcq import MMIOInterface
@ -352,13 +352,24 @@ class AMDev:
time.sleep(0.001)
raise RuntimeError(f'wait_reg timeout reg=0x{reg.addr:X} mask=0x{mask:X} value=0x{value:X} last_val=0x{rval}')
def _read_vram(self, addr, size) -> bytes:
assert addr % 4 == 0 and size % 4 == 0, f"Invalid address {addr:#x} or size {size:#x}"
res = []
for caddr in range(addr, addr + size, 4):
self.wreg(0x06, caddr >> 31)
self.wreg(0x00, (caddr & 0x7FFFFFFF) | 0x80000000)
res.append(self.rreg(0x01))
return bytes(array.array('I', res))
def _run_discovery(self):
# NOTE: Fixed register to query memory size without known ip bases to find the discovery table.
# The table is located at the end of VRAM - 64KB and is 10KB in size.
mmRCC_CONFIG_MEMSIZE = 0xde3
self.vram_size = self.rreg(mmRCC_CONFIG_MEMSIZE) << 20
tmr_offset, tmr_size = self.vram_size - (64 << 10), (10 << 10)
self.bhdr = am.struct_binary_header.from_buffer(bytearray(self.vram.view(self.vram_size - (64 << 10), (10 << 10))[:]))
disc_tbl = self._read_vram(tmr_offset, tmr_size) if self.vram.nbytes < self.vram_size else self.vram.view(tmr_offset, tmr_size)[:]
self.bhdr = am.struct_binary_header.from_buffer(bytearray(disc_tbl))
ihdr = am.struct_ip_discovery_header.from_address(ctypes.addressof(self.bhdr) + self.bhdr.table_list[am.IP_DISCOVERY].offset)
assert ihdr.signature == am.DISCOVERY_TABLE_SIGNATURE and not ihdr.base_addr_64_bit, f"0x{ihdr.signature:X} != 0x{am.DISCOVERY_TABLE_SIGNATURE:X}"

View file

@ -1,7 +1,9 @@
import functools, importlib
import functools, importlib, time
from collections import defaultdict
from dataclasses import dataclass
from tinygrad.helpers import getbits
from tinygrad.helpers import getbits, round_up
from tinygrad.runtime.autogen import pci
from tinygrad.runtime.support.usb import ASM24Controller
@dataclass(frozen=True)
class AMDRegBase:
@ -29,3 +31,39 @@ def import_module(name:str, version:tuple[int, ...], version_prefix:str=""):
try: return importlib.import_module(f"tinygrad.runtime.autogen.am.{name}_{version_prefix}{'_'.join(map(str, ver))}")
except ImportError: pass
raise ImportError(f"Failed to load autogen module for {name.upper()} {'.'.join(map(str, version))}")
def setup_pci_bars(usb:ASM24Controller, gpu_bus:int, mem_base:int, pref_mem_base:int) -> dict[int, tuple[int, int]]:
  """Make the GPU on `gpu_bus` reachable through the controller and assign addresses to its memory BARs.

  If the GPU's vendor id cannot be read (pcie_cfg_req raises RuntimeError), every bridge on
  buses 0..gpu_bus-1 is programmed first: bus numbers, memory windows, a 100 ms secondary bus
  reset, and the command register.

  Args:
    usb: controller used for all config-space requests.
    gpu_bus: bus number the GPU sits on (device 0, function 0).
    mem_base: first address handed to non-prefetchable memory BARs.
    pref_mem_base: first address handed to prefetchable memory BARs.

  Returns:
    Mapping of BAR index -> (assigned address, size) for every memory BAR found.
  """
  try: usb.pcie_cfg_req(pci.PCI_VENDOR_ID, bus=gpu_bus, dev=0, fn=0, size=2)
  except RuntimeError:
    for bus in range(gpu_bus):
      usb.pcie_cfg_req(pci.PCI_SUBORDINATE_BUS, bus=bus, dev=0, fn=0, value=gpu_bus, size=1)
      usb.pcie_cfg_req(pci.PCI_SECONDARY_BUS, bus=bus, dev=0, fn=0, value=bus+1, size=1)
      usb.pcie_cfg_req(pci.PCI_PRIMARY_BUS, bus=bus, dev=0, fn=0, value=max(0, bus-1), size=1)

      usb.pcie_cfg_req(pci.PCI_MEMORY_BASE, bus=bus, dev=0, fn=0, value=mem_base>>16, size=2)
      usb.pcie_cfg_req(pci.PCI_MEMORY_LIMIT, bus=bus, dev=0, fn=0, value=0xf000, size=2)
      usb.pcie_cfg_req(pci.PCI_PREF_MEMORY_BASE, bus=bus, dev=0, fn=0, value=pref_mem_base>>16, size=2)
      usb.pcie_cfg_req(pci.PCI_PREF_MEMORY_LIMIT, bus=bus, dev=0, fn=0, value=mem_base>>16, size=2)

      # hold the secondary bus in reset for 100 ms, then release it
      usb.pcie_cfg_req(pci.PCI_BRIDGE_CONTROL, bus=bus, dev=0, fn=0, value=pci.PCI_BRIDGE_CTL_BUS_RESET, size=1)
      time.sleep(0.1)
      usb.pcie_cfg_req(pci.PCI_BRIDGE_CONTROL, bus=bus, dev=0, fn=0, value=0x0, size=1)

      usb.pcie_cfg_req(pci.PCI_COMMAND, bus=bus, dev=0, fn=0, value=pci.PCI_COMMAND_IO | pci.PCI_COMMAND_MEMORY | pci.PCI_COMMAND_MASTER, size=1)

  # mem_space_addr[0] is the next non-prefetchable address, mem_space_addr[1] the next prefetchable one
  mem_space_addr, bar_off, bars = [mem_base, pref_mem_base], 0, {}
  while bar_off < 24:
    bar_reg = pci.PCI_BASE_ADDRESS_0 + bar_off
    cfg = usb.pcie_cfg_req(bar_reg, bus=gpu_bus, dev=0, fn=0, size=4)
    is_mem = (cfg & pci.PCI_BASE_ADDRESS_SPACE) == pci.PCI_BASE_ADDRESS_SPACE_MEMORY
    # The 64-bit type bit has to be tested on the raw register value. The previous code compared
    # the space bit (0 or 1) with PCI_BASE_ADDRESS_MEM_TYPE_64, which never matched, so the upper
    # dword of 64-bit BARs was never written. The type field only exists for memory BARs.
    is_64bit = is_mem and bool(cfg & pci.PCI_BASE_ADDRESS_MEM_TYPE_64)

    if is_mem:
      bar_mem = bool(cfg & pci.PCI_BASE_ADDRESS_MEM_PREFETCH)

      # size probe: write all ones, read back, and the size is the two's complement of the address mask
      usb.pcie_cfg_req(bar_reg, bus=gpu_bus, dev=0, fn=0, value=0xffffffff, size=4)
      bar_size = 0xffffffff - (usb.pcie_cfg_req(bar_reg, bus=gpu_bus, dev=0, fn=0, size=4) & 0xfffffff0) + 1

      usb.pcie_cfg_req(bar_reg, bus=gpu_bus, dev=0, fn=0, value=mem_space_addr[bar_mem], size=4)
      bars[bar_off // 4] = (mem_space_addr[bar_mem], bar_size)
      mem_space_addr[bar_mem] += round_up(bar_size, 2 << 20)

      # 64bit bar, zero out the upper 32 bits
      if is_64bit: usb.pcie_cfg_req(bar_reg + 4, bus=gpu_bus, dev=0, fn=0, value=0, size=4)

    # a 64-bit BAR occupies two consecutive BAR registers
    bar_off += 8 if is_64bit else 4

  usb.pcie_cfg_req(pci.PCI_COMMAND, bus=gpu_bus, dev=0, fn=0, value=pci.PCI_COMMAND_IO | pci.PCI_COMMAND_MEMORY | pci.PCI_COMMAND_MASTER, size=1)
  return bars

View file

@ -0,0 +1,243 @@
import ctypes, struct, dataclasses, array
from typing import Sequence
from tinygrad.runtime.autogen import libusb
from tinygrad.helpers import DEBUG
from tinygrad.runtime.support.hcq import MMIOInterface
class USB3:
def __init__(self, vendor:int, dev:int, ep_data_in:int, ep_stat_in:int, ep_data_out:int, ep_cmd_out:int, max_streams:int=16, max_read_len:int=4096):
self.vendor, self.dev = vendor, dev
self.ep_data_in, self.ep_stat_in, self.ep_data_out, self.ep_cmd_out = ep_data_in, ep_stat_in, ep_data_out, ep_cmd_out
self.max_streams, self.max_read_len = max_streams, max_read_len
self.ctx = ctypes.POINTER(libusb.struct_libusb_context)()
if libusb.libusb_init(ctypes.byref(self.ctx)): raise RuntimeError("libusb_init failed")
if DEBUG >= 6: libusb.libusb_set_option(self.ctx, libusb.LIBUSB_OPTION_LOG_LEVEL, 4)
self.handle = libusb.libusb_open_device_with_vid_pid(self.ctx, self.vendor, self.dev)
if not self.handle: raise RuntimeError(f"device {self.vendor:04x}:{self.dev:04x} not found. sudo required?")
# Detach kernel driver if needed
if libusb.libusb_kernel_driver_active(self.handle, 0):
libusb.libusb_detach_kernel_driver(self.handle, 0)
libusb.libusb_reset_device(self.handle)
# Set configuration and claim interface
if libusb.libusb_set_configuration(self.handle, 1): raise RuntimeError("set_configuration failed")
if libusb.libusb_claim_interface(self.handle, 0): raise RuntimeError("claim_interface failed. sudo required?")
if libusb.libusb_set_interface_alt_setting(self.handle, 0, 1): raise RuntimeError("alt_setting failed")
# Clear any stalled endpoints
all_eps = (self.ep_data_out, self.ep_data_in, self.ep_stat_in, self.ep_cmd_out)
for ep in all_eps: libusb.libusb_clear_halt(self.handle, ep)
# Allocate streams
stream_eps = (ctypes.c_uint8 * 3)(self.ep_data_out, self.ep_data_in, self.ep_stat_in)
if (rc:=libusb.libusb_alloc_streams(self.handle, self.max_streams * len(stream_eps), stream_eps, len(stream_eps))) < 0:
raise RuntimeError(f"alloc_streams failed: {rc}")
# Base cmd
cmd_template = bytes([0x01, 0x00, 0x00, 0x01, *([0] * 12), 0xE4, 0x24, 0x00, 0xB2, 0x1A, 0x00, 0x00, 0x00, *([0] * 8)])
# Init pools
self.tr = {ep: [libusb.libusb_alloc_transfer(0) for _ in range(self.max_streams)] for ep in all_eps}
self.buf_cmd = [(ctypes.c_uint8 * len(cmd_template))(*cmd_template) for _ in range(self.max_streams)]
self.buf_stat = [(ctypes.c_uint8 * 64)() for _ in range(self.max_streams)]
self.buf_data_in = [(ctypes.c_uint8 * 0x1000)() for _ in range(self.max_streams)]
self.buf_data_out = [(ctypes.c_uint8 * 0x1000)() for _ in range(self.max_streams)]
def _prep_transfer(self, tr, ep, stream_id, buf, length):
tr.contents.dev_handle, tr.contents.endpoint, tr.contents.length, tr.contents.buffer = self.handle, ep, length, buf
tr.contents.status, tr.contents.flags, tr.contents.timeout, tr.contents.num_iso_packets = 0xff, 0, 1000, 0
tr.contents.type = (libusb.LIBUSB_TRANSFER_TYPE_BULK_STREAM if stream_id is not None else libusb.LIBUSB_TRANSFER_TYPE_BULK)
if stream_id is not None: libusb.libusb_transfer_set_stream_id(tr, stream_id)
return tr
def _submit_and_wait(self, cmds):
for tr in cmds: libusb.libusb_submit_transfer(tr)
running = len(cmds)
while running:
libusb.libusb_handle_events(self.ctx)
running = len(cmds)
for tr in cmds:
if tr.contents.status == libusb.LIBUSB_TRANSFER_COMPLETED: running -= 1
elif tr.contents.status != 0xFF: raise RuntimeError(f"EP 0x{tr.contents.endpoint:02X} error: {tr.contents.status}")
def send_batch(self, cdbs:list[bytes], idata:list[int]|None=None, odata:list[bytes|None]|None=None) -> list[bytes|None]:
idata, odata = idata or [0] * len(cdbs), odata or [None] * len(cdbs)
results, tr_window, op_window = [], [], []
for idx, (cdb, rlen, send_data) in enumerate(zip(cdbs, idata, odata)):
# allocate slot and stream. stream is 1-based
slot, stream = idx % self.max_streams, (idx % self.max_streams) + 1
# build cmd packet
struct.pack_into(">B", self.buf_cmd[slot], 3, stream)
self.buf_cmd[slot][16:16+len(cdb)] = list(cdb)
# cmd + stat transfers
tr_window.append(self._prep_transfer(self.tr[self.ep_cmd_out][slot], self.ep_cmd_out, None, self.buf_cmd[slot], len(self.buf_cmd[slot])))
tr_window.append(self._prep_transfer(self.tr[self.ep_stat_in][slot], self.ep_stat_in, stream, self.buf_stat[slot], 64))
if rlen:
if rlen > self.max_read_len: raise ValueError("read length > max_read_len per CDB")
tr_window.append(self._prep_transfer(self.tr[self.ep_data_in][slot], self.ep_data_in, stream, self.buf_data_in[slot], rlen))
if send_data is not None:
if len(send_data) > len(self.buf_data_out[slot]): self.buf_data_out[slot] = (ctypes.c_uint8 * len(send_data))()
self.buf_data_out[slot][:len(send_data)] = list(send_data)
tr_window.append(self._prep_transfer(self.tr[self.ep_data_out][slot], self.ep_data_out, stream, self.buf_data_out[slot], len(send_data)))
op_window.append((idx, slot, rlen))
if (idx + 1 == len(cdbs)) or len(op_window) >= self.max_streams:
self._submit_and_wait(tr_window)
for idx, slot, rlen in op_window: results.append(bytes(self.buf_data_in[slot][:rlen]) if rlen else None)
tr_window = []
return results
@dataclasses.dataclass(frozen=True)
class WriteOp:
  """Write `data` byte by byte starting at controller address `addr`.

  With `ignore_cache=False`, ASM24Controller.exec_ops skips bytes whose cached value already matches.
  """
  addr: int
  data: bytes
  ignore_cache: bool = True
@dataclasses.dataclass(frozen=True)
class ReadOp:
  """Read `size` bytes starting at controller address `addr` (exec_ops requires size <= 0xff)."""
  addr: int
  size: int
@dataclasses.dataclass(frozen=True)
class ScsiWriteOp:
  """Send `data` (zero-padded to 4096 bytes by exec_ops) with a SCSI write command at block address `lba`."""
  data: bytes
  lba: int = 0
class ASM24Controller:
  """Drives the USB bridge controller (vendor 0xADD1, product 0x0001) through `USB3`.

  Offers byte-wise register writes (CDB opcode 0xE5), register reads (0xE4) and a 4 KiB SCSI write (0x8A),
  and builds PCIe config/memory requests on top of them via the controller's 0xB2xx registers.
  """
  def __init__(self):
    self.usb = USB3(0xADD1, 0x0001, 0x81, 0x83, 0x02, 0x04)
    # Last byte queued for each controller address; None marks an address whose value is unknown (set by reads).
    self._cache: dict[int, int|None] = {}

    # Init controller.
    self.exec_ops([WriteOp(0x54b, b' '), WriteOp(0x5a8, b'\x02'), WriteOp(0x5f8, b'\x04'), WriteOp(0x7ec, b'\x01\x00\x00\x00'),
                   WriteOp(0xc422, b'\x02'), WriteOp(0x0, b'\x33')])

  def exec_ops(self, ops:Sequence[WriteOp|ReadOp|ScsiWriteOp]):
    """Translate `ops` into CDBs and run them in a single `USB3.send_batch` call.

    A WriteOp becomes one request per byte (skipped when `ignore_cache` is False and the cache already holds
    that byte), a ReadOp becomes one request of at most 0xff bytes, a ScsiWriteOp one 4096-byte write.

    Returns:
      The list returned by `send_batch`: one entry per issued request (not per op), bytes for reads and
      None otherwise.
    """
    cdbs:list[bytes] = []
    idata:list[int] = []
    odata:list[bytes|None] = []

    def _add_req(cdb:bytes, i:int, o:bytes|None):
      # Append in place: rebuilding the three lists for every request made large byte-wise writes quadratic.
      cdbs.append(cdb)
      idata.append(i)
      odata.append(o)

    for op in ops:
      if isinstance(op, WriteOp):
        for off, value in enumerate(op.data):
          addr = ((op.addr + off) & 0x1FFFF) | 0x500000
          if not op.ignore_cache and self._cache.get(addr) == value: continue
          _add_req(struct.pack('>BBBHB', 0xE5, value, addr >> 16, addr & 0xFFFF, 0), 0, None)
          # The cache is updated when the request is queued, before it is actually sent.
          self._cache[addr] = value
      elif isinstance(op, ReadOp):
        assert op.size <= 0xff
        addr = (op.addr & 0x1FFFF) | 0x500000
        _add_req(struct.pack('>BBBHB', 0xE4, op.size, addr >> 16, addr & 0xFFFF, 0), op.size, None)
        # Every address covered by the read is marked unknown, forcing the next cached write to go out.
        for i in range(op.size): self._cache[addr + i] = None
      elif isinstance(op, ScsiWriteOp):
        # Transfer length is 4096//512 = 8 blocks; the payload is zero-padded up to 4096 bytes.
        _add_req(struct.pack('>BBQIBB', 0x8A, 0, op.lba, 4096//512, 0, 0), 0, op.data+b'\x00'*(4096-len(op.data)))

    return self.usb.send_batch(cdbs, idata, odata)

  def write(self, base_addr:int, data:bytes, ignore_cache:bool=True):
    """Write `data` starting at controller address `base_addr`."""
    return self.exec_ops([WriteOp(base_addr, data, ignore_cache)])

  def scsi_write(self, buf:bytes, lba:int=0):
    """Send `buf` with a SCSI write, then write 0xff,0xff,0xff to 0x171 and 0x00,0x00 to 0xce6e in the same batch."""
    self.exec_ops([ScsiWriteOp(buf, lba), WriteOp(0x171, b'\xff\xff\xff', ignore_cache=True), WriteOp(0xce6e, b'\x00\x00', ignore_cache=True)])

  def read(self, base_addr:int, length:int, stride:int=0xff) -> bytes:
    """Read `length` bytes starting at `base_addr`, split into requests of at most `stride` bytes."""
    parts = self.exec_ops([ReadOp(base_addr + off, min(stride, length - off)) for off in range(0, length, stride)])
    return b''.join(p or b'' for p in parts)[:length]

  def pcie_request(self, fmt_type, address, value=None, size=4, cnt=10):
    """Issue one PCIe request through the controller's 0xB2xx registers.

    Args:
      fmt_type: one-byte value written to 0xB210 (this class passes 0x00/0x40 for memory reads/writes and
        0x04/0x05/0x44/0x45 for config reads/writes).
      address: byte address; the access must not cross a 4-byte boundary.
      value: data to write, or None for a read.
      size: access width in bytes (1..4).
      cnt: retries left when the status register reports bit 0.

    Returns:
      The `size`-byte value for reads (`value is None`), otherwise None.

    Raises:
      RuntimeError: non-zero completion status, or an unexpected 0xB284 bit 0 for config requests.
    """
    assert fmt_type >> 8 == 0 and size > 0 and size <= 4, f"Invalid fmt_type {fmt_type} or size {size}"
    if DEBUG >= 3: print("pcie_request", hex(fmt_type), hex(address), value, size, cnt)

    # Requests are dword aligned; `offset` selects the byte lanes inside that dword.
    masked_address, offset = address & 0xFFFFFFFC, address & 0x3
    assert size + offset <= 4

    ops = []
    if value is not None:
      assert value >> (8 * size) == 0
      ops.append(WriteOp(0xB220, struct.pack('>I', value << (8 * offset)), ignore_cache=False))

    # 0xB218: aligned address, 0xB217: byte-lane mask, 0xB210: fmt/type.
    # The writes to 0xB254 and 0xB296 are always sent -- presumably they start the request (TODO confirm).
    ops += [WriteOp(0xB218, struct.pack('>I', masked_address), ignore_cache=False),
            WriteOp(0xB217, bytes([((1 << size) - 1) << offset]), ignore_cache=False),
            WriteOp(0xB210, bytes([fmt_type]), ignore_cache=False),
            WriteOp(0xB254, b"\x0f", ignore_cache=True), WriteOp(0xB296, b"\x04", ignore_cache=True)]
    self.exec_ops(ops)

    # Fast path for write requests
    if ((fmt_type & 0b11011111) == 0b01000000) or ((fmt_type & 0b10111000) == 0b00110000): return

    # Poll 0xB296 until bit 1 is set. When bit 0 shows up it is cleared by writing 1 and, while retries
    # remain, the whole request is reissued.
    while (stat:=self.read(0xB296, 1)[0]) & 2 == 0:
      if stat & 1:
        self.write(0xB296, bytes([0x01]))
        if cnt > 0: return self.pcie_request(fmt_type, address, value, size, cnt=cnt-1)
    assert stat == 2, f"stat read 2 was {stat}"

    # Retrieve completion data from Link Status (0xB22A, 0xB22B)
    b284 = self.read(0xB284, 1)[0]
    completion = struct.unpack('>H', self.read(0xB22A, 2))

    # Validate completion status based on PCIe request type
    # Completion TLPs for configuration requests always have a byte count of 4.
    is_cfg = fmt_type & 0xbe == 0x04
    assert completion[0] & 0xfff == (4 if is_cfg else size)

    # Extract completion status field
    status = (completion[0] >> 13) & 0x7

    # Handle completion errors or inconsistencies: config reads expect 0xB284 bit 0 set, config writes expect it clear.
    if status or (is_cfg and (((value is None) and (not (b284 & 0x01))) or ((value is not None) and (b284 & 0x01)))):
      status_map = {0b000: "Successful Completion (SC)", 0b001: "Unsupported Request (UR)",
                    0b010: "Configuration Request Retry Status (CRS)", 0b100: "Completer Abort (CA)"}
      desc = status_map.get(status, f"Reserved (0b{status:03b})")
      raise RuntimeError(f"Completion status: {desc}, 0xB284 bit 0: {b284 & 0x01}")

    # Read data comes back in 0xB220; shift and mask it down to the requested byte lanes.
    if value is None: return (struct.unpack('>I', self.read(0xB220, 4))[0] >> (8 * offset)) & ((1 << (8 * size)) - 1)

  def pcie_cfg_req(self, byte_addr, bus=1, dev=0, fn=0, value=None, size=4):
    """Read (value is None) or write `size` bytes of config space at `byte_addr` of bus/dev/fn.

    fmt_type is 0x04 for reads and 0x44 for writes, with bit 0 set when `bus` is non-zero.
    """
    assert byte_addr >> 12 == 0 and bus >> 8 == 0 and dev >> 5 == 0 and fn >> 3 == 0, f"Invalid byte_addr {byte_addr}, bus {bus}, dev {dev}, fn {fn}"

    fmt_type = (0x44 if value is not None else 0x4) | int(bus > 0)
    address = (bus << 24) | (dev << 19) | (fn << 16) | (byte_addr & 0xfff)
    return self.pcie_request(fmt_type, address, value, size)

  def pcie_mem_req(self, address, value=None, size=4):
    """Read (value is None, fmt_type 0x00) or write (fmt_type 0x40) `size` bytes of PCIe memory at `address`."""
    return self.pcie_request(0x40 if value is not None else 0x0, address, value, size)
class USBMMIOInterface(MMIOInterface):
  """MMIOInterface whose loads and stores are carried out by a USB controller object.

  With `pcimem=True` every access becomes `usb.pcie_mem_req` calls of 1, 2 or 4 bytes; with `pcimem=False`
  the range is accessed through `usb.read` / `usb.write` (or `usb.scsi_write` for the view based at 0xf000).
  """
  def __init__(self, usb, addr, size, fmt, pcimem=True):
    self.usb, self.addr, self.nbytes = usb, addr, size
    self.fmt, self.pcimem = fmt, pcimem
    self.el_sz = struct.calcsize(fmt)

  def __getitem__(self, index):
    return self._access_items(index)

  def __setitem__(self, index, val):
    self._access_items(index, val)

  def _access_items(self, index, val=None):
    """Read (val is None) or write the element or slice selected by `index`; indices are in units of `fmt`."""
    if isinstance(index, slice):
      first = index.start or 0
      last = index.stop or len(self)
      return self._acc(first * self.el_sz, (last - first) * self.el_sz, val)
    byte_off = index * self.el_sz
    if self.pcimem: return self._acc_one(byte_off, self.el_sz, val)
    return self._acc(byte_off, self.el_sz, val)

  def view(self, offset:int=0, size:int|None=None, fmt=None):
    """Return a new interface over the same device starting `offset` bytes in, optionally with another size/fmt."""
    nbytes = size or (self.nbytes - offset)
    return USBMMIOInterface(self.usb, self.addr+offset, nbytes, fmt=fmt or self.fmt, pcimem=self.pcimem)

  def _acc_size(self, sz):
    """Return (array typecode, width) of the widest of 4/2/1 bytes that divides `sz`."""
    for code, width in (('I', 4), ('H', 2), ('B', 1)):
      if sz % width == 0: return (code, width)
    raise StopIteration

  def _acc_one(self, off, sz, val=None):
    """Single PCIe memory access of `sz` bytes at byte offset `off`; returns the value when reading."""
    reading = val is None
    # 8-byte accesses are split in two dwords, upper half first.
    hi = 0
    if sz >= 8: hi = self.usb.pcie_mem_req(self.addr + off + 4, None if reading else (val >> 32), 4)
    lo = self.usb.pcie_mem_req(self.addr + off, None if reading else val & 0xffffffff, min(sz, 4))
    if reading: return lo | (hi << 32)

  def _acc(self, off, sz, data=None):
    """Read `sz` bytes at `off` (data is None) or write `data` there."""
    if data is None: # read op
      if not self.pcimem: return self.usb.read(self.addr + off, sz)
      code, width = self._acc_size(sz)
      words = [self._acc_one(off + pos, width) for pos in range(0, sz, width)]
      return array.array(code, words).tobytes()

    # write op
    payload = struct.pack(self.fmt, data) if isinstance(data, int) else bytes(data)
    if not self.pcimem:
      # Fast path for writing into buffer 0xf000
      if self.addr == 0xf000: return self.usb.scsi_write(payload)
      return self.usb.write(self.addr + off, payload)

    # NOTE(review): payload is already a byte string, so scaling its length by the element size only
    # influences which access width gets picked -- confirm this is intended.
    _, width = self._acc_size(len(payload) * struct.calcsize(self.fmt))
    for pos in range(0, len(payload), width):
      self._acc_one(off + pos, width, int.from_bytes(payload[pos:pos+width], "little"))