mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
usb gpu (#8766)
* start gpu * progress * fixes * read correct * libusb * libusb works * support asm24 * hmm * one access file * fix extra * start AMBar * works on am * back to usb * patch fw * full fast write into a bar * ugh, minus one gpus, next please * mute libusb for now * usb for asm24 * 63 * hmm * ops * rescan * and gpu shoudl be there * enumerate them? * usbgpu bus 4, 100% reliable (draft) * lil * works * comments * add DEBUG * cleaner * simplest * Revert "simplest" This reverts commit1d00354c16. * Revert "cleaner" This reverts commitc5662de956. * assert we find gpu * that's simpler * this back * simpler? * correcT * work * nonsense * works with more checks * this works * the 6s in the right place * reliable now * fix after reboot * set config * 1s timeouts * close to fw loading * streams * usbhub works * endpoints * fix * want to test tiny10 * move to tiny 10 * fix gpu * ugly speed * smth * mostly broken, but signals and dmas * do not reset gpu every time * changes to run kernels * ugh, not working * t10 * pg and sc files * some prog * um? * somehow it works * patched for 24 * some tries * minimal * moving * back to working * so sloooooow * move to controller * usb.py rewrite * rework * cleaner 1 * cleaner 2 * cleaner 3 * new abstractions * aft merge * init controller * cleaner 4 * cleaner 5 * patcher + tiny changes * ignore that * cleaner 6 * after rebase * cleaner 7 * bring it back * start linter war * linter 2 * autogen was missing * fix autogen * typing * better? * mypy * extra/legacy rename and cleaner * shuffle * better printing * tiny changes and tests --------- Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
parent
7573c0ef4e
commit
30bd6a619f
14 changed files with 2368 additions and 14 deletions
|
|
@ -528,6 +528,19 @@ generate_webgpu() {
|
|||
python3 -c "import tinygrad.runtime.autogen.webgpu"
|
||||
}
|
||||
|
||||
# Generate ctypes bindings for libusb-1.0 into $BASE/libusb.py and patch them so the
# shared library is resolved at import time (LIBUSB_PATH overrides the system lookup).
generate_libusb() {
  clang2py -k cdefstum \
    /usr/include/libusb-1.0/libusb.h \
    -o "$BASE/libusb.py"

  fixup "$BASE/libusb.py"
  # ctypes.util has to be imported explicitly: the CDLL expression injected below calls
  # ctypes.util.find_library, and a plain `import ctypes` does not load that submodule.
  sed -i "s/import ctypes/import ctypes, ctypes.util, os/g" "$BASE/libusb.py"
  sed -i "s/FIXME_STUB/libusb/g" "$BASE/libusb.py"
  sed -i "s/libusb_le16_to_cpu = libusb_cpu_to_le16//g" "$BASE/libusb.py"
  # Leave the library handle as None when libusb is not installed instead of failing at import.
  sed -i "s/FunctionFactoryStub()/None if (lib_path:=os.getenv('LIBUSB_PATH', ctypes.util.find_library('usb-1.0'))) is None else ctypes.CDLL(lib_path)/g" "$BASE/libusb.py"
  # Smoke test: the generated module must import cleanly.
  python3 -c "import tinygrad.runtime.autogen.libusb"
}
|
||||
|
||||
if [ "$1" == "opencl" ]; then generate_opencl
|
||||
elif [ "$1" == "hip" ]; then generate_hip
|
||||
elif [ "$1" == "comgr" ]; then generate_comgr
|
||||
|
|
@ -548,6 +561,7 @@ elif [ "$1" == "adreno" ]; then generate_adreno
|
|||
elif [ "$1" == "pci" ]; then generate_pci
|
||||
elif [ "$1" == "vfio" ]; then generate_vfio
|
||||
elif [ "$1" == "webgpu" ]; then generate_webgpu
|
||||
elif [ "$1" == "libusb" ]; then generate_libusb
|
||||
elif [ "$1" == "all" ]; then generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_nvrtc; generate_hsa; generate_kfd; generate_nv; generate_amd; generate_io_uring; generate_libc; generate_am; generate_webgpu
|
||||
else echo "usage: $0 <type>"
|
||||
fi
|
||||
|
|
|
|||
2
extra/usbgpu/.gitignore
vendored
Normal file
2
extra/usbgpu/.gitignore
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
|||
Software/
|
||||
fw.zip
|
||||
88
extra/usbgpu/legacy/patch_exp.py
Executable file
88
extra/usbgpu/legacy/patch_exp.py
Executable file
|
|
@ -0,0 +1,88 @@
|
|||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import zlib
|
||||
|
||||
def patch(input_filepath, output_filepath, patches):
  """Apply byte patches to a firmware image and write the result with a refreshed footer.

  Args:
    input_filepath: path of the image to read.
    output_filepath: path the patched image is written to.
    patches: iterable of (offset, expected_bytes, new_bytes) tuples. The bytes currently
      at `offset` must equal `expected_bytes`; they are replaced by `new_bytes`.

  Returns:
    True once the patched image has been written, False (nothing written) if a patch
    would extend past the end of the image.

  Raises:
    ValueError: `expected_bytes` and `new_bytes` differ in length.
    AssertionError: the image does not contain `expected_bytes` at `offset`.
  """
  with open(input_filepath, 'rb') as infile: data = bytearray(infile.read())

  for offset, expected_bytes, new_bytes in patches:
    if len(expected_bytes) != len(new_bytes):
      print(len(expected_bytes), len(new_bytes))
      raise ValueError("Expected bytes and new bytes must be the same length")

    if offset + len(new_bytes) > len(data): return False
    current_bytes = data[offset:offset + len(expected_bytes)]
    # Raised explicitly rather than via `assert`: `python -O` strips assert statements, which
    # would let a patch overwrite bytes of an image it was never written for.
    if bytes(current_bytes) != expected_bytes:
      raise AssertionError(f"Expected {expected_bytes} at offset {offset:x}, but got {current_bytes}")
    data[offset:offset + len(new_bytes)] = new_bytes

  # Footer: one additive checksum byte followed by a little-endian CRC32, both computed
  # over data[4:-6] of the patched image.
  body = data[4:-6]
  data[-5] = sum(body) & 0xff
  data[-4:] = zlib.crc32(body).to_bytes(4, 'little')

  with open(output_filepath, 'wb') as outfile:
    outfile.write(data)

  return True
|
||||
|
||||
patches = [
|
||||
# (0x3903 + 1 + 4, b'\x8a', b'\x8b'),
|
||||
# (0x3cf9 + 1 + 4, b'\x8a', b'\x8b'), # this is the one which triggered...
|
||||
|
||||
(0x2a0d + 1 + 4, b'\x0a', b'\x05'), # write handle exit with code 5 (?)
|
||||
# (0x40e1 + 4, b'\x90\x06\xe6\x04\xf0\x78\x0d\xe6\xfe\x24\x71\x12\x1b\x0b\x60\x0b\x74\x08', b'\x7f\x00\x12\x53\x21\x12\x1c\xfc\x74\x01\xf6\x90\x90\x94\x74\x10\xf0\x22')
|
||||
# (0x29ad + 1 + 4, b'\x09', b'\x05'), # write handle exit with code 5 (?)
|
||||
# (0x40ef + 0 + 4, b'\x60', b'\x70'), # jz -> jnz
|
||||
# (0x40e1 + 0 + 4, b'\x90', b'\x22'), # jmp -> ret
|
||||
# (0x40fa + 0 + 4, b'\x80', b'\x22'),
|
||||
# (0x40e1 + 0 + 4, b'\x90\x06\xe6\x04\xf0', b'\x7f\x00\x02\x41\x7c'), # jmp -> ret
|
||||
]
|
||||
|
||||
next_traphandler = 0
|
||||
def add_traphandler(addr, sec):
  """Queue two patches that redirect the code at `addr` through a counting trampoline.

  `sec` is the byte sequence expected at `addr`. It is overwritten with a 3-byte jump to the
  next free trampoline slot (padded with 0x22 up to len(sec)), and the slot is filled with a
  stub that bumps the byte at 0x3000 + slot, runs `sec`, then jumps to `addr + len(sec)`.
  NOTE(review): the byte strings look like 8051 machine code (0x02 ljmp, 0x22 ret) - confirm
  against the controller's instruction set.
  """
  global next_traphandler, patches

  slot = next_traphandler
  trap_addr = 0x6000 + slot * 0x20   # each trampoline gets a 0x20-byte slot starting at 0x6000
  cntr_addr = 0x3000 + slot          # one hit-counter byte per trampoline
  return_addr = addr + len(sec)

  # replacement for the original site: jump to the trampoline, pad the rest of `sec`
  hook = b'\x02' + trap_addr.to_bytes(2, 'big') + b'\x22' * (len(sec) - 3)

  # trampoline body: counter increment, the displaced bytes, then the jump back
  bump_counter = b'\xc0\xe0\xc0\x82\xc0\x83\x90' + cntr_addr.to_bytes(2, 'big') + b'\xe0\x04\xf0\xd0\x83\xd0\x82\xd0\xe0'
  stub = bump_counter + sec + b'\x02' + return_addr.to_bytes(2, 'big')

  # the slot is expected to be zero-filled; +4 matches the file offset used by the other patches
  patches += [
    (addr + 4, sec, hook),
    (trap_addr + 4, b'\x00' * (21 + len(sec)), stub),
  ]
  next_traphandler = slot + 1
|
||||
|
||||
# add_traphandler(0x0206, b'\xed\x54\x06') # fill_scsi_resp
|
||||
# add_traphandler(0x40d9, b'\x78\x6a\xe6') # fill_scsi_to_usb_transport
|
||||
# add_traphandler(0x4d44, b'\x78\x6a\xe6') # FUN_CODE_4d44
|
||||
# add_traphandler(0x4784, b'\x78\x6a\xe6') # FUN_CODE_4784
|
||||
# add_traphandler(0x3e81, b'\x90\xc5\x16') # FUN_CODE_3e81
|
||||
# add_traphandler(0x32a5, b'\x78\x6a\xe6') # FUN_CODE_32a5
|
||||
# add_traphandler(0x2a10, b'\x90\xc4\x51') # FUN_CODE_2a10
|
||||
# add_traphandler(0x2608, b'\x12\x16\x87') # FUN_CODE_2608
|
||||
# add_traphandler(0x0e78, b'\x90\xc8\x02') # main usb entry
|
||||
# add_traphandler(0x102f, b'\x12\x18\x0d') # possible scsi entry parser
|
||||
# add_traphandler(0x1198, b'\x12\x18\x0d') # close_to_scsi_parse_1_and_set_c47a_to_0xff caller to scsi
|
||||
# add_traphandler(0x180d, b'\x90\x0a\x7d') # close_to_scsi_parse
|
||||
# add_traphandler(0x1114, b'\x75\x37\x00') # entry into if ((DAT_EXTMEM_c802 >> 2 & 1) != 0) { in main usb entry
|
||||
# add_traphandler(0x113a, b'\x90\x90\x00') # exit from scsi parse loop
|
||||
# add_traphandler(0x117b, b'\xd0\x07\xd0\x06') # exit from main usb entry
|
||||
|
||||
|
||||
# add_traphandler(0x2f81, b'\x90\x0a\x59') # main loop? 8
|
||||
# add_traphandler(0xc7a7, b'\x90\x09\xfa') # call smth in write path 9
|
||||
# add_traphandler(0x2fcb, b'\x90\x0a\x59') # if ((DAT_EXTMEM_0ae2 != 0) && (DAT_EXTMEM_0ae2 != 0x10)) {
|
||||
# add_traphandler(0x2fc0, b'\x90\x0a\xe2') # submain loop 11
|
||||
# add_traphandler(0x30be, b'\x90\x0a\x5a') # aft sub loop 12
|
||||
# add_traphandler(0x3076, b'\x12\x03\x59') # call to call_wait_for_nvme??(); 13
|
||||
# add_traphandler(0x30ad, b'\x12\x04\xe4') # call to call_wait_for_nvme??(); 14
|
||||
|
||||
# add_traphandler(0x2608, b'\x12\x16\x87') # FUN_CODE_2608
|
||||
# add_traphandler(0x10ee, b'\x90\x04\x64') # inside trap handler
|
||||
# add_traphandler(0x10e0, b'\x90\xc8\x06') # inside trap handler
|
||||
# add_traphandler(0x4977, b'\x90\x0a\xa8') # waiter for nvme???
|
||||
|
||||
assert patch(sys.argv[1], sys.argv[2], patches) is True
|
||||
22
extra/usbgpu/legacy/wr_speed.py
Normal file
22
extra/usbgpu/legacy/wr_speed.py
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
import array, time, ctypes, struct, random
|
||||
from hexdump import hexdump
|
||||
from tinygrad.runtime.support.usb import ASMController, WriteOp
|
||||
from tinygrad.runtime.autogen import pci
|
||||
from tinygrad.helpers import Timing
|
||||
from tinygrad import Device
|
||||
|
||||
# Rough throughput check: time 64 SCSI writes of a 4 KiB buffer, then 64 reads of 4 KiB at 0xf000.
usb = ASMController()

# 4 KiB payload filled with a single random byte value
fill_byte = random.randint(0, 255)
payload = (ctypes.c_uint8 * 4096)(*([fill_byte] * 4096))

# show the fill value next to the first 16 bytes currently at 0xf000
print(fill_byte, usb.read(0xf000, 0x10))

with Timing():
  for _ in range(64):
    usb.scsi_write(payload)

with Timing():
  for _ in range(64):
    usb.read(0xf000, 0x1000)

exit(0)
|
||||
83
extra/usbgpu/patch.py
Executable file
83
extra/usbgpu/patch.py
Executable file
|
|
@ -0,0 +1,83 @@
|
|||
#!/usr/bin/env python3
|
||||
import sys, os, zlib, struct
|
||||
from hexdump import hexdump
|
||||
from tinygrad.helpers import DEBUG, getenv, fetch
|
||||
from tinygrad.runtime.support.usb import USB3
|
||||
|
||||
def patch(input_filepath, patches):
  """Apply byte patches to a firmware image and return it with a refreshed footer.

  Args:
    input_filepath: path of the image to read.
    patches: iterable of (offset, expected_bytes, new_bytes) tuples. The bytes currently
      at `offset` must equal `expected_bytes`; they are replaced by `new_bytes`.

  Returns:
    The patched image as a bytearray, or False if a patch would extend past the end
    of the image.

  Raises:
    ValueError: `expected_bytes` and `new_bytes` differ in length.
    AssertionError: the image does not contain `expected_bytes` at `offset`.
  """
  with open(input_filepath, 'rb') as infile: data = bytearray(infile.read())

  for offset, expected_bytes, new_bytes in patches:
    if len(expected_bytes) != len(new_bytes):
      raise ValueError("Expected bytes and new bytes must be the same length")

    if offset + len(new_bytes) > len(data): return False
    current_bytes = data[offset:offset + len(expected_bytes)]
    # Raised explicitly rather than via `assert`: `python -O` strips assert statements, which
    # would let a patch overwrite bytes of an image it was never written for.
    if bytes(current_bytes) != expected_bytes:
      raise AssertionError(f"Expected {expected_bytes} at offset {offset:x}, but got {current_bytes}")
    data[offset:offset + len(new_bytes)] = new_bytes

  # Footer: one additive checksum byte followed by a little-endian CRC32, both computed
  # over data[4:-6] of the patched image.
  body = data[4:-6]
  data[-5] = sum(body) & 0xff
  data[-4:] = zlib.crc32(body).to_bytes(4, 'little')
  return data
|
||||
|
||||
path = os.path.dirname(os.path.abspath(__file__))
|
||||
file_path = os.path.join(path, "Software/AS_USB4_240417_85_00_00.bin")
|
||||
|
||||
if not os.path.exists(file_path):
|
||||
url = "https://web.archive.org/web/20250430124720/https://www.station-drivers.com/index.php/en/component/remository/func-download/6341/chk,3ef8b04704a18eb2fc57ff60382379ad/no_html,1/lang,en-gb/"
|
||||
os.system(f'curl -o "{path}/fw.zip" "{url}"')
|
||||
os.system(f'unzip -o "{path}/fw.zip" "Software/AS_USB4_240417_85_00_00.bin" -d "{path}"')
|
||||
|
||||
patches = [(0x2a0d + 1 + 4, b'\x0a', b'\x05')]
|
||||
patched_fw = patch(file_path, patches)
|
||||
|
||||
vendor, device = [int(x, base=16) for x in getenv("USBDEV", "174C:2464").split(":")]
|
||||
try: dev = USB3(vendor, device, 0x81, 0x83, 0x02, 0x04)
|
||||
except RuntimeError as e:
|
||||
raise RuntimeError(f'{e}. You can set USBDEV environment variable to your device\'s vendor and device ID (e.g., USBDEV="174C:2464")') from e
|
||||
|
||||
config1 = bytes([
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0x41, 0x41, 0x41, 0x41, 0x42, 0x42, 0x42, 0x42, 0x30, 0x30, 0x36, 0x30,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x74, 0x69, 0x6E, 0x79, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x74, 0x69, 0x6E, 0x79,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0x55, 0x53, 0x42, 0x20, 0x33, 0x2E, 0x32, 0x20, 0x50, 0x43, 0x49, 0x65,
|
||||
0x20, 0x54, 0x69, 0x6E, 0x79, 0x45, 0x6E, 0x63, 0x6C, 0x6F, 0x73, 0x75, 0x72, 0x65, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0x54, 0x69, 0x6E, 0x79, 0x45, 0x6E, 0x63, 0x6C, 0x6F, 0x73, 0x75, 0x72,
|
||||
0x65, 0xFF, 0xFF, 0xFF, 0xD1, 0xAD, 0x01, 0x00, 0x00, 0x01, 0xCF, 0xFF, 0x02, 0xFF, 0x5A, 0x94])
|
||||
|
||||
config2 = bytes([
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0x47, 0x6F, 0x70, 0x6F, 0x64, 0x20, 0x47, 0x72, 0x6F, 0x75, 0x70, 0x20,
|
||||
0x4C, 0x69, 0x6D, 0x69, 0x74, 0x65, 0x64, 0x2E, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x55, 0x53, 0x42, 0x34,
|
||||
0x20, 0x4E, 0x56, 0x4D, 0x65, 0x20, 0x53, 0x53, 0x44, 0x20, 0x50, 0x72, 0x6F, 0x20, 0x45, 0x6E,
|
||||
0x63, 0x6C, 0x6F, 0x73, 0x75, 0x72, 0x65, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
|
||||
0xFF, 0xFF, 0xFF, 0xFF, 0x8C, 0xBF, 0xFF, 0x97, 0xC1, 0xF3, 0xFF, 0xFF, 0x01, 0x2D, 0x66, 0xD6,
|
||||
0x66, 0x06, 0x00, 0xC0, 0x87, 0x01, 0x5A, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xCA, 0x01, 0x66, 0xD6,
|
||||
0xE3, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0xFF, 0xFF, 0x01, 0x00, 0xA5, 0x67])
|
||||
|
||||
part1 = patched_fw[:0xff00]
|
||||
part2 = patched_fw[0xff00:]
|
||||
|
||||
# config patch
|
||||
cdb = struct.pack('>BBB12x', 0xe1, 0x50, 0x0)
|
||||
dev.send_batch(cdbs=[cdb], odata=[config1])
|
||||
|
||||
cdb = struct.pack('>BBB12x', 0xe1, 0x50, 0x1)
|
||||
dev.send_batch(cdbs=[cdb], odata=[config2])
|
||||
|
||||
cdb = struct.pack('>BBI', 0xe3, 0x50, len(part1))
|
||||
dev.send_batch(cdbs=[cdb], odata=[part1])
|
||||
|
||||
cdb = struct.pack('>BBI', 0xe3, 0xd0, len(part2))
|
||||
dev.send_batch(cdbs=[cdb], odata=[part2])
|
||||
|
||||
cdb = struct.pack('>BB13x', 0xe8, 0x51)
|
||||
dev.send_batch(cdbs=[cdb])
|
||||
|
||||
print("done, you can disconnect the controller!")
|
||||
67
extra/usbgpu/scan_pci.py
Normal file
67
extra/usbgpu/scan_pci.py
Normal file
|
|
@ -0,0 +1,67 @@
|
|||
import array, time
|
||||
from hexdump import hexdump
|
||||
from tinygrad.runtime.support.usb import ASM24Controller
|
||||
from tinygrad.runtime.autogen import pci
|
||||
|
||||
usb = ASM24Controller()
|
||||
|
||||
def print_cfg(bus, dev):
  """Read 256 bytes of PCI config space of function 0 at (bus, dev), hexdump and return them.

  The space is fetched as 64 four-byte config reads through the module-level `usb` controller
  and returned as a bytearray.
  """
  dwords = [usb.pcie_cfg_req(off, bus=bus, dev=dev, fn=0, value=None, size=4) for off in range(0, 256, 4)]

  print("bus={}, dev={}".format(bus, dev))
  dmp = bytearray(array.array('I', dwords))
  hexdump(dmp)
  return dmp
|
||||
|
||||
def rescan_bus(bus, gpu_bus):
  """Program the bridge at (bus, dev 0, fn 0): bus numbers, a secondary bus reset, memory windows.

  `gpu_bus` is written as the subordinate bus number; the secondary bus is bus+1 and the
  primary bus is bus-1 (clamped at 0).
  """
  def cfg_write(reg, value, size):
    # every write targets device 0 / function 0 on `bus`
    usb.pcie_cfg_req(reg, bus=bus, dev=0, fn=0, value=value, size=size)

  print("set PCI_SUBORDINATE_BUS bus={} to {}".format(bus, gpu_bus))
  cfg_write(pci.PCI_SUBORDINATE_BUS, gpu_bus, 1)
  cfg_write(pci.PCI_SECONDARY_BUS, bus+1, 1)
  cfg_write(pci.PCI_PRIMARY_BUS, max(0, bus-1), 1)

  print("rescan bus={}".format(bus))
  # set the bus-reset bit, wait, then replace it with the parity/SERR control bits
  cfg_write(pci.PCI_BRIDGE_CONTROL, pci.PCI_BRIDGE_CTL_BUS_RESET, 1)
  time.sleep(0.1)
  cfg_write(pci.PCI_BRIDGE_CONTROL, pci.PCI_BRIDGE_CTL_PARITY|pci.PCI_BRIDGE_CTL_SERR, 1)

  # memory and prefetchable-memory window registers
  cfg_write(pci.PCI_MEMORY_BASE, 0x1000, 2)
  cfg_write(pci.PCI_MEMORY_LIMIT, 0x2000, 2)
  cfg_write(pci.PCI_PREF_MEMORY_BASE, 0x2000, 2)
  cfg_write(pci.PCI_PREF_MEMORY_LIMIT, 0xffff, 2)
|
||||
|
||||
print_cfg(0, 0)
|
||||
rescan_bus(0, gpu_bus=4)
|
||||
|
||||
print_cfg(1, 0)
|
||||
rescan_bus(1, gpu_bus=4)
|
||||
|
||||
time.sleep(0.1)
|
||||
print_cfg(2, 0)
|
||||
|
||||
def setup_bus(bus, gpu_bus):
  """Configure the bridge at (bus, dev 0, fn 0): bus numbers, reset, command bits, memory windows.

  Unlike rescan_bus there is no delay after the bus-reset write, and the bridge's command
  register is set to IO | MEMORY | MASTER.
  """
  def cfg_write(reg, value, size):
    # every write targets device 0 / function 0 on `bus`
    usb.pcie_cfg_req(reg, bus=bus, dev=0, fn=0, value=value, size=size)

  print("setup bus={}".format(bus))
  cfg_write(pci.PCI_SUBORDINATE_BUS, gpu_bus, 1)
  cfg_write(pci.PCI_SECONDARY_BUS, bus+1, 1)
  cfg_write(pci.PCI_PRIMARY_BUS, max(0, bus-1), 1)

  cfg_write(pci.PCI_BRIDGE_CONTROL, pci.PCI_BRIDGE_CTL_BUS_RESET, 1)
  cfg_write(pci.PCI_BRIDGE_CONTROL, pci.PCI_BRIDGE_CTL_PARITY|pci.PCI_BRIDGE_CTL_SERR, 1)
  cfg_write(pci.PCI_COMMAND, pci.PCI_COMMAND_IO | pci.PCI_COMMAND_MEMORY | pci.PCI_COMMAND_MASTER, 1)

  # memory window registers
  cfg_write(pci.PCI_MEMORY_BASE, 0x1000, 2)
  cfg_write(pci.PCI_MEMORY_LIMIT, 0x2000, 2)

  # prefetchable-memory window registers
  cfg_write(pci.PCI_PREF_MEMORY_BASE, 0x2000, 2)
  cfg_write(pci.PCI_PREF_MEMORY_LIMIT, 0xffff, 2)
|
||||
|
||||
setup_bus(2, gpu_bus=4)
|
||||
print_cfg(3, 0)
|
||||
|
||||
setup_bus(3, gpu_bus=4)
|
||||
dmp = print_cfg(4, 0)
|
||||
print(dmp[0:4])
|
||||
assert dmp[0:4] == b"\x02\x10\x80\x74", "GPU NOT FOUND!"
|
||||
|
||||
print("GPU FOUND!")
|
||||
39
test/external/external_test_usb_asm24.py
vendored
Normal file
39
test/external/external_test_usb_asm24.py
vendored
Normal file
|
|
@ -0,0 +1,39 @@
|
|||
import unittest, time
|
||||
from tinygrad.runtime.support.usb import ASM24Controller
|
||||
|
||||
class TestASMController(unittest.TestCase):
  """Hardware tests for ASM24Controller: read/write round trips and rough 4K timings."""

  @classmethod
  def setUpClass(cls):
    # one controller shared by every test in the class
    cls.ctrl = ASM24Controller()

  def test_write_and_read(self):
    base, data = 0xF000, b"hello!"
    self.ctrl.write(base, data)
    self.assertEqual(self.ctrl.read(base, len(data)), data)

  def test_scsi_write_and_read_from_f000(self):
    # data sent with scsi_write is expected to be readable back at 0xF000
    payload = b"\x5b" * 4096
    self.ctrl.scsi_write(payload, lba=0)
    self.assertEqual(self.ctrl.read(0xF000, len(payload)), payload)

  def test_scsi_write_speed_4k(self):
    payload = b"\x5a" * 4096
    t0 = time.perf_counter()
    self.ctrl.scsi_write(payload, lba=0)
    dur_ms = (time.perf_counter() - t0) * 1000
    print(f"scsi_write 4K took {dur_ms:.3f} ms")

  def test_read_speed_4k(self):
    payload = b"\xa5" * 4096
    self.ctrl.write(0xF000, payload)
    t0 = time.perf_counter()
    out = self.ctrl.read(0xF000, 4096)
    dur_ms = (time.perf_counter() - t0) * 1000
    print(f"read 4K took {dur_ms:.3f} ms")
    self.assertEqual(out, payload)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
13
test/mockgpu/usb.py
Normal file
13
test/mockgpu/usb.py
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
class MockUSB:
  """In-memory stand-in for a USB controller, backed by a mutable byte buffer `mem`."""

  def __init__(self, mem):
    self.mem = mem

  def read(self, address, size):
    """Return `size` bytes starting at `address` as an immutable bytes object."""
    end = address + size
    return bytes(self.mem[address:end])

  def write(self, address, data):
    """Store `data` into the buffer starting at `address`."""
    end = address + len(data)
    self.mem[address:end] = data

  def pcie_mem_req(self, address, value=None, size=1):
    """Little-endian integer access: read `size` bytes when `value` is None, otherwise write `value`."""
    window = slice(address, address + size)
    if value is not None:
      self.mem[window] = value.to_bytes(size, "little")
      return None
    return int.from_bytes(self.mem[window], "little")
|
||||
|
|
@ -1,6 +1,8 @@
|
|||
import unittest, array, time
|
||||
from tinygrad.helpers import mv_address
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface
|
||||
from tinygrad.runtime.support.usb import USBMMIOInterface
|
||||
from test.mockgpu.usb import MockUSB
|
||||
|
||||
class TestHCQIface(unittest.TestCase):
|
||||
def setUp(self):
|
||||
|
|
@ -51,5 +53,53 @@ class TestHCQIface(unittest.TestCase):
|
|||
mvend = time.perf_counter()
|
||||
print(f"speed: hcq {end - start:.6f}s vs plain mv {mvend - mvstart:.6f}s")
|
||||
|
||||
class TestUSBMMIOInterface(unittest.TestCase):
  """Exercise USBMMIOInterface indexing, slicing and views against a MockUSB buffer."""

  def setUp(self):
    self.size = 256
    self.buffer = bytearray(self.size)
    self.usb = MockUSB(self.buffer)
    self.mmio = USBMMIOInterface(self.usb, 0, self.size, fmt='B', pcimem=False)

  def test_getitem_setitem_byte(self):
    self.mmio[1] = 0xAB
    # with pcimem=False a single-index read comes back as a one-byte bytes object
    self.assertEqual(self.mmio[1], bytes([0xAB]))
    self.assertEqual(self.usb.mem[1], 0xAB)

  def test_slice_getitem_setitem(self):
    values = [1, 2, 3, 4]
    self.mmio[10:14] = values
    raw = self.mmio[10:14]
    self.assertIsInstance(raw, bytes)
    self.assertEqual(list(raw), values)
    self.assertEqual(list(self.usb.mem[10:14]), values)

  def test_view(self):
    self.mmio[0] = 5
    # a sub-view indexes relative to its own offset
    view = self.mmio.view(offset=1, size=3)
    self.assertEqual(view[0], bytes([self.usb.mem[1]]))
    view[:] = [7, 8, 9]
    self.assertEqual(list(self.usb.mem[1:4]), [7, 8, 9])
    # a view with no arguments spans the whole interface and sees later writes
    full_view = self.mmio.view()
    self.assertEqual(len(full_view), len(self.mmio))
    self.mmio[2] = 0xFE
    self.assertEqual(full_view[2], bytes([0xFE]))

  def test_pcimem_byte(self):
    usb2 = MockUSB(bytearray(self.size))
    mmio_pci = USBMMIOInterface(usb2, 0, self.size, fmt='B', pcimem=True)
    mmio_pci[3] = 0x11
    # with pcimem=True a single-index read comes back as an int
    self.assertEqual(mmio_pci[3], 0x11)
    self.assertEqual(usb2.mem[3], 0x11)

  def test_pcimem_slice(self):
    usb3 = MockUSB(bytearray(self.size))
    mmio_pci = USBMMIOInterface(usb3, 0, self.size, fmt='B', pcimem=True)
    values = [2, 3, 4]
    mmio_pci[4:7] = values
    raw = mmio_pci[4:7]
    self.assertIsInstance(raw, bytes)
    self.assertEqual(list(raw), values)
    self.assertEqual([mmio_pci[i] for i in range(4, 7)], values)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
|
|||
1643
tinygrad/runtime/autogen/libusb.py
Normal file
1643
tinygrad/runtime/autogen/libusb.py
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -15,7 +15,8 @@ from tinygrad.runtime.autogen.am import am
|
|||
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
|
||||
from tinygrad.runtime.support.elf import elf_loader
|
||||
from tinygrad.runtime.support.am.amdev import AMDev, AMMapping
|
||||
from tinygrad.runtime.support.amd import AMDRegBase, collect_registers, import_module
|
||||
from tinygrad.runtime.support.amd import AMDRegBase, collect_registers, import_module, setup_pci_bars
|
||||
from tinygrad.runtime.support.usb import ASM24Controller, USBMMIOInterface
|
||||
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint: disable=unused-import
|
||||
|
||||
EVENT_INDEX_PARTIAL_FLUSH = 4 # based on a comment in nvd.h
|
||||
|
|
@ -461,6 +462,8 @@ class AMDProgram(HCQProgram):
|
|||
if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
|
||||
|
||||
class AMDAllocator(HCQAllocator['AMDDevice']):
|
||||
def __init__(self, dev:AMDDevice): super().__init__(dev, copy_bufs=getattr(dev.dev_iface, 'copy_bufs', None))
|
||||
|
||||
def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
|
||||
return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
|
||||
|
||||
|
|
@ -722,14 +725,17 @@ class PCIIface:
|
|||
bar_info = FileIOInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
|
||||
self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
|
||||
|
||||
self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2, fmt='Q'), self._map_pci_range(5, fmt='I'))
|
||||
self.ip_versions = self.adev.ip_ver
|
||||
self.ip_offsets = {hwip: tuple(instances[0]) for hwip,instances in self.adev.regs_offset.items()}
|
||||
self._setup_adev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2, fmt='Q'), self._map_pci_range(5, fmt='I'))
|
||||
self.doorbell_cpu_addr = dbell.addr
|
||||
|
||||
pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
|
||||
self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
|
||||
|
||||
def _setup_adev(self, name, vram:MMIOInterface, doorbell:MMIOInterface, mmio:MMIOInterface):
|
||||
self.adev = AMDev(name, vram, doorbell, mmio)
|
||||
self.ip_versions = self.adev.ip_ver
|
||||
self.ip_offsets = {hwip: tuple(instances[0]) for hwip,instances in self.adev.regs_offset.items()}
|
||||
|
||||
gfxver = int(f"{self.adev.ip_ver[am.GC_HWIP][0]:02d}{self.adev.ip_ver[am.GC_HWIP][1]:02d}{self.adev.ip_ver[am.GC_HWIP][2]:02d}")
|
||||
array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
|
||||
simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
|
||||
|
|
@ -796,16 +802,50 @@ class PCIIface:
|
|||
|
||||
def device_fini(self): self.adev.fini()
|
||||
|
||||
class USBIface(PCIIface):
  """AM device interface for a GPU reached through an ASM24 USB controller instead of local PCI."""

  def __init__(self, dev, dev_id):
    self.dev = dev
    self.usb = ASM24Controller()
    self.bars = setup_pci_bars(self.usb, gpu_bus=4, mem_base=0x40000000, pref_mem_base=0x10000000)

    # BAR 0 is passed as vram, BAR 2 as the doorbell aperture and BAR 5 as the mmio registers
    vram = USBMMIOInterface(self.usb, *self.bars[0], fmt='B')
    doorbell = USBMMIOInterface(self.usb, *self.bars[2], fmt='Q')
    mmio = USBMMIOInterface(self.usb, *self.bars[5], fmt='I')
    self._setup_adev(f"usb:{dev_id}", vram, doorbell, mmio)

    # special regions
    # NOTE(review): 0x200000 (GPU-side system address) and 0xf000 (controller-side view) presumably
    # refer to the same 4K staging buffer inside the USB controller - confirm.
    copy_vaddr = self.adev.mm.alloc_vaddr(size=0x1000, align=0x1000)
    self.copy_region = self.adev.mm.map_range(copy_vaddr, 0x1000, [(0x200000, 0x1000)], system=True, snooped=False, uncached=True)
    copy_meta = AMAllocationMeta(self.dev, [self.dev], self.copy_region, has_cpu_mapping=False)
    copy_view = USBMMIOInterface(self.usb, 0xf000, 0x1000, fmt='B', pcimem=False)
    self.copy_bufs = [HCQBuffer(copy_vaddr, 0x1000, meta=copy_meta, view=copy_view)]

  def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
    """Allocate `size` bytes (rounded up to 4K); a USB-backed view is attached only when cpu_access is set."""
    size = round_up(size, 4 << 10)
    am_mapping = self.adev.mm.valloc(size, uncached=uncached, contigous=cpu_access)
    meta = AMAllocationMeta(self.dev, [self.dev], am_mapping, has_cpu_mapping=False)
    # the view starts at the first physical chunk, offset by the start of BAR 0
    view = USBMMIOInterface(self.usb, self.bars[0][0] + am_mapping.paddrs[0][0], size, fmt='B') if cpu_access else None
    return HCQBuffer(am_mapping.va_addr, size, meta=meta, view=view)

  def create_queue(self, queue_type, ring, gart, eop_buffer=None, cwsr_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, xcc_id=0):
    """Set up an SDMA or compute ring and return an AMDQueueDesc whose views all go through vram/doorbell MMIO."""
    # read pointer lives at the start of `gart`, write pointer 0x10 bytes after it
    rptr_addr, wptr_addr = gart.va_addr, gart.va_addr+0x10

    if queue_type == kfd.KFD_IOC_QUEUE_TYPE_SDMA:
      doorbell_index = am.AMDGPU_NAVI10_DOORBELL_sDMA_ENGINE0
      self.adev.sdma.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=rptr_addr, wptr_addr=wptr_addr,
                                doorbell=doorbell_index, pipe=0, queue=0)
    else:
      doorbell_index = am.AMDGPU_NAVI10_DOORBELL_MEC_RING0
      self.adev.gfx.setup_ring(ring_addr=ring.va_addr, ring_size=ring.size, rptr_addr=rptr_addr, wptr_addr=wptr_addr,
                               eop_addr=eop_buffer.va_addr, eop_size=eop_buffer.size, doorbell=doorbell_index, pipe=0, queue=0)

    ring_paddr = ring.meta.mapping.paddrs[0][0]
    gart_paddr = gart.meta.mapping.paddrs[0][0]
    return AMDQueueDesc(ring=self.adev.vram.view(ring_paddr, 0x8000, fmt='I'),
                        doorbells=[self.adev.doorbell64.view(doorbell_index * 8, 8, fmt='Q')],
                        read_ptrs=[self.adev.vram.view(gart_paddr, 8, fmt='Q')],
                        write_ptrs=[self.adev.vram.view(gart_paddr+0x10, 8, fmt='Q')])
|
||||
|
||||
class AMDDevice(HCQCompiled):
|
||||
devices: ClassVar[list[HCQCompiled]] = []
|
||||
signal_pages: ClassVar[list[HCQBuffer]] = []
|
||||
signal_pool: ClassVar[list[HCQBuffer]] = []
|
||||
|
||||
def is_am(self) -> bool: return isinstance(self.dev_iface, PCIIface)
|
||||
def is_am(self) -> bool: return isinstance(self.dev_iface, (PCIIface, USBIface))
|
||||
def is_usb(self) -> bool: return isinstance(self.dev_iface, USBIface)
|
||||
|
||||
def _select_iface(self):
|
||||
errs:str = ""
|
||||
for iface_t in (KFDIface, PCIIface) if len(nm:=getenv("AMD_IFACE", "")) == 0 else (getattr(sys.modules[__name__], f"{nm}Iface"),):
|
||||
for iface_t in (KFDIface, PCIIface, USBIface) if len(nm:=getenv("AMD_IFACE", "")) == 0 else (getattr(sys.modules[__name__], f"{nm}Iface"),):
|
||||
try: return iface_t(self, self.device_id)
|
||||
except Exception as e: errs += f"\n{iface_t.__name__}: {type(e).__name__}: {e}"
|
||||
raise RuntimeError(f"Cannot find a usable interface for AMD:{self.device_id}:{errs}")
|
||||
|
|
@ -844,15 +884,16 @@ class AMDDevice(HCQCompiled):
|
|||
nbio_pad = (0,) if self.target[0] == 9 else ()
|
||||
self.nbio = AMDIP('nbio' if self.target[0]<12 else 'nbif', nbio_ver, nbio_pad+self.dev_iface.ip_offsets[am.NBIF_HWIP])
|
||||
|
||||
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x800000, ctx_save_restore_size=wg_data_size + ctl_stack_size,
|
||||
eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
|
||||
self.compute_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_COMPUTE, 0x8000 if self.is_usb() else 0x800000,
|
||||
ctx_save_restore_size=wg_data_size + ctl_stack_size, eop_buffer_size=0x1000, ctl_stack_size=ctl_stack_size, debug_memory_size=debug_memory_size)
|
||||
|
||||
max_copy_size = 0x40000000 if self.dev_iface.ip_versions[am.SDMA0_HWIP][0] >= 5 else 0x400000
|
||||
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x800000)
|
||||
self.sdma_queue = self.create_queue(kfd.KFD_IOC_QUEUE_TYPE_SDMA, 0x8000 if self.is_usb() else 0x800000)
|
||||
|
||||
super().__init__(device, AMDAllocator(self), AMDLLVMRenderer(self.arch) if getenv("AMD_LLVM", 0) else AMDRenderer(self.arch),
|
||||
AMDLLVMCompiler(self.arch) if getenv("AMD_LLVM", 0) else HIPCompiler(self.arch), functools.partial(AMDProgram, self),
|
||||
AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size))
|
||||
AMDSignal, functools.partial(AMDComputeQueue, self), functools.partial(AMDCopyQueue, self, max_copy_size=max_copy_size),
|
||||
kernargs_size=(8 << 10) if self.is_usb() else (16 << 20))
|
||||
|
||||
# Scratch setup
|
||||
self.max_private_segment_size = 0
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
from __future__ import annotations
|
||||
import ctypes, collections, time, dataclasses, functools, fcntl, os, hashlib
|
||||
import ctypes, collections, time, dataclasses, functools, fcntl, os, hashlib, array
|
||||
from tinygrad.helpers import mv_address, getenv, round_up, DEBUG, temp, fetch
|
||||
from tinygrad.runtime.autogen.am import am, mp_11_0
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface
|
||||
|
|
@ -352,13 +352,24 @@ class AMDev:
|
|||
time.sleep(0.001)
|
||||
raise RuntimeError(f'wait_reg timeout reg=0x{reg.addr:X} mask=0x{mask:X} value=0x{value:X} last_val=0x{rval}')
|
||||
|
||||
def _read_vram(self, addr, size) -> bytes:
|
||||
assert addr % 4 == 0 and size % 4 == 0, f"Invalid address {addr:#x} or size {size:#x}"
|
||||
res = []
|
||||
for caddr in range(addr, addr + size, 4):
|
||||
self.wreg(0x06, caddr >> 31)
|
||||
self.wreg(0x00, (caddr & 0x7FFFFFFF) | 0x80000000)
|
||||
res.append(self.rreg(0x01))
|
||||
return bytes(array.array('I', res))
|
||||
|
||||
def _run_discovery(self):
|
||||
# NOTE: Fixed register to query memory size without known ip bases to find the discovery table.
|
||||
# The table is located at the end of VRAM - 64KB and is 10KB in size.
|
||||
mmRCC_CONFIG_MEMSIZE = 0xde3
|
||||
self.vram_size = self.rreg(mmRCC_CONFIG_MEMSIZE) << 20
|
||||
tmr_offset, tmr_size = self.vram_size - (64 << 10), (10 << 10)
|
||||
|
||||
self.bhdr = am.struct_binary_header.from_buffer(bytearray(self.vram.view(self.vram_size - (64 << 10), (10 << 10))[:]))
|
||||
disc_tbl = self._read_vram(tmr_offset, tmr_size) if self.vram.nbytes < self.vram_size else self.vram.view(tmr_offset, tmr_size)[:]
|
||||
self.bhdr = am.struct_binary_header.from_buffer(bytearray(disc_tbl))
|
||||
ihdr = am.struct_ip_discovery_header.from_address(ctypes.addressof(self.bhdr) + self.bhdr.table_list[am.IP_DISCOVERY].offset)
|
||||
assert ihdr.signature == am.DISCOVERY_TABLE_SIGNATURE and not ihdr.base_addr_64_bit, f"0x{ihdr.signature:X} != 0x{am.DISCOVERY_TABLE_SIGNATURE:X}"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,9 @@
|
|||
import functools, importlib
|
||||
import functools, importlib, time
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from tinygrad.helpers import getbits
|
||||
from tinygrad.helpers import getbits, round_up
|
||||
from tinygrad.runtime.autogen import pci
|
||||
from tinygrad.runtime.support.usb import ASM24Controller
|
||||
|
||||
@dataclass(frozen=True)
|
||||
class AMDRegBase:
|
||||
|
|
@ -29,3 +31,39 @@ def import_module(name:str, version:tuple[int, ...], version_prefix:str=""):
|
|||
try: return importlib.import_module(f"tinygrad.runtime.autogen.am.{name}_{version_prefix}{'_'.join(map(str, ver))}")
|
||||
except ImportError: pass
|
||||
raise ImportError(f"Failed to load autogen module for {name.upper()} {'.'.join(map(str, version))}")
|
||||
|
||||
def setup_pci_bars(usb:ASM24Controller, gpu_bus:int, mem_base:int, pref_mem_base:int) -> dict[int, tuple[int, int]]:
  """Assign addresses to the memory BARs of the device at (bus=gpu_bus, dev=0, fn=0).

  If the device's vendor id cannot be read (the config request raises RuntimeError), every
  bridge on buses `0..gpu_bus-1` is first programmed with bus numbers and memory windows,
  pulsed through a secondary bus reset and enabled; the BARs are then sized and placed.

  Args:
    usb: controller used to issue PCIe configuration requests.
    gpu_bus: bus number the GPU is expected on.
    mem_base: start address handed out to non-prefetchable memory BARs.
    pref_mem_base: start address handed out to prefetchable memory BARs.

  Returns:
    Mapping of BAR index -> (assigned address, size) for every memory BAR found.
  """
  try: usb.pcie_cfg_req(pci.PCI_VENDOR_ID, bus=gpu_bus, dev=0, fn=0, size=2)
  except RuntimeError:
    # GPU is not reachable yet: set up the chain of bridges leading to it.
    for bus in range(gpu_bus):
      usb.pcie_cfg_req(pci.PCI_SUBORDINATE_BUS, bus=bus, dev=0, fn=0, value=gpu_bus, size=1)
      usb.pcie_cfg_req(pci.PCI_SECONDARY_BUS, bus=bus, dev=0, fn=0, value=bus+1, size=1)
      usb.pcie_cfg_req(pci.PCI_PRIMARY_BUS, bus=bus, dev=0, fn=0, value=max(0, bus-1), size=1)

      usb.pcie_cfg_req(pci.PCI_MEMORY_BASE, bus=bus, dev=0, fn=0, value=mem_base>>16, size=2)
      usb.pcie_cfg_req(pci.PCI_MEMORY_LIMIT, bus=bus, dev=0, fn=0, value=0xf000, size=2)
      usb.pcie_cfg_req(pci.PCI_PREF_MEMORY_BASE, bus=bus, dev=0, fn=0, value=pref_mem_base>>16, size=2)
      usb.pcie_cfg_req(pci.PCI_PREF_MEMORY_LIMIT, bus=bus, dev=0, fn=0, value=mem_base>>16, size=2)

      # Pulse the secondary bus reset, holding it for 100ms.
      usb.pcie_cfg_req(pci.PCI_BRIDGE_CONTROL, bus=bus, dev=0, fn=0, value=pci.PCI_BRIDGE_CTL_BUS_RESET, size=1)
      time.sleep(0.1)
      usb.pcie_cfg_req(pci.PCI_BRIDGE_CONTROL, bus=bus, dev=0, fn=0, value=0x0, size=1)

      usb.pcie_cfg_req(pci.PCI_COMMAND, bus=bus, dev=0, fn=0, value=pci.PCI_COMMAND_IO | pci.PCI_COMMAND_MEMORY | pci.PCI_COMMAND_MASTER, size=1)

  # mem_space_addr[0] is the next free non-prefetchable address, [1] the next prefetchable one.
  mem_space_addr, bar_off, bars = [mem_base, pref_mem_base], 0, {}
  while bar_off < 24:
    cfg = usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, size=4)
    bar_mem, bar_space = bool(cfg & pci.PCI_BASE_ADDRESS_MEM_PREFETCH), cfg & pci.PCI_BASE_ADDRESS_SPACE
    # The 64-bit flag lives in the type bits of cfg, not in the space bit, so it must be tested on cfg itself.
    bar_64 = bool(cfg & pci.PCI_BASE_ADDRESS_MEM_TYPE_64)

    if bar_space == pci.PCI_BASE_ADDRESS_SPACE_MEMORY:
      # Size the BAR: write all ones, read back, and invert the address bits.
      # NOTE(review): only the lower dword is sized, so this assumes BARs smaller than 4GB.
      usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, value=0xffffffff, size=4)
      bar_size = 0xffffffff - (usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, size=4) & 0xfffffff0) + 1

      usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off, bus=gpu_bus, dev=0, fn=0, value=mem_space_addr[bar_mem], size=4)
      bars[bar_off // 4] = (mem_space_addr[bar_mem], bar_size)
      mem_space_addr[bar_mem] += round_up(bar_size, 2 << 20)

      # 64bit bar, zero out the upper 32 bits
      if bar_64: usb.pcie_cfg_req(pci.PCI_BASE_ADDRESS_0 + bar_off + 4, bus=gpu_bus, dev=0, fn=0, value=0, size=4)

    # A 64-bit BAR occupies two consecutive dwords.
    bar_off += 8 if bar_64 else 4

  usb.pcie_cfg_req(pci.PCI_COMMAND, bus=gpu_bus, dev=0, fn=0, value=pci.PCI_COMMAND_IO | pci.PCI_COMMAND_MEMORY | pci.PCI_COMMAND_MASTER, size=1)
  return bars
|
||||
|
|
|
|||
243
tinygrad/runtime/support/usb.py
Normal file
243
tinygrad/runtime/support/usb.py
Normal file
|
|
@ -0,0 +1,243 @@
|
|||
import ctypes, struct, dataclasses, array
|
||||
from typing import Sequence
|
||||
from tinygrad.runtime.autogen import libusb
|
||||
from tinygrad.helpers import DEBUG
|
||||
from tinygrad.runtime.support.hcq import MMIOInterface
|
||||
|
||||
class USB3:
|
||||
def __init__(self, vendor:int, dev:int, ep_data_in:int, ep_stat_in:int, ep_data_out:int, ep_cmd_out:int, max_streams:int=16, max_read_len:int=4096):
|
||||
self.vendor, self.dev = vendor, dev
|
||||
self.ep_data_in, self.ep_stat_in, self.ep_data_out, self.ep_cmd_out = ep_data_in, ep_stat_in, ep_data_out, ep_cmd_out
|
||||
self.max_streams, self.max_read_len = max_streams, max_read_len
|
||||
self.ctx = ctypes.POINTER(libusb.struct_libusb_context)()
|
||||
|
||||
if libusb.libusb_init(ctypes.byref(self.ctx)): raise RuntimeError("libusb_init failed")
|
||||
if DEBUG >= 6: libusb.libusb_set_option(self.ctx, libusb.LIBUSB_OPTION_LOG_LEVEL, 4)
|
||||
|
||||
self.handle = libusb.libusb_open_device_with_vid_pid(self.ctx, self.vendor, self.dev)
|
||||
if not self.handle: raise RuntimeError(f"device {self.vendor:04x}:{self.dev:04x} not found. sudo required?")
|
||||
|
||||
# Detach kernel driver if needed
|
||||
if libusb.libusb_kernel_driver_active(self.handle, 0):
|
||||
libusb.libusb_detach_kernel_driver(self.handle, 0)
|
||||
libusb.libusb_reset_device(self.handle)
|
||||
|
||||
# Set configuration and claim interface
|
||||
if libusb.libusb_set_configuration(self.handle, 1): raise RuntimeError("set_configuration failed")
|
||||
if libusb.libusb_claim_interface(self.handle, 0): raise RuntimeError("claim_interface failed. sudo required?")
|
||||
if libusb.libusb_set_interface_alt_setting(self.handle, 0, 1): raise RuntimeError("alt_setting failed")
|
||||
|
||||
# Clear any stalled endpoints
|
||||
all_eps = (self.ep_data_out, self.ep_data_in, self.ep_stat_in, self.ep_cmd_out)
|
||||
for ep in all_eps: libusb.libusb_clear_halt(self.handle, ep)
|
||||
|
||||
# Allocate streams
|
||||
stream_eps = (ctypes.c_uint8 * 3)(self.ep_data_out, self.ep_data_in, self.ep_stat_in)
|
||||
if (rc:=libusb.libusb_alloc_streams(self.handle, self.max_streams * len(stream_eps), stream_eps, len(stream_eps))) < 0:
|
||||
raise RuntimeError(f"alloc_streams failed: {rc}")
|
||||
|
||||
# Base cmd
|
||||
cmd_template = bytes([0x01, 0x00, 0x00, 0x01, *([0] * 12), 0xE4, 0x24, 0x00, 0xB2, 0x1A, 0x00, 0x00, 0x00, *([0] * 8)])
|
||||
|
||||
# Init pools
|
||||
self.tr = {ep: [libusb.libusb_alloc_transfer(0) for _ in range(self.max_streams)] for ep in all_eps}
|
||||
|
||||
self.buf_cmd = [(ctypes.c_uint8 * len(cmd_template))(*cmd_template) for _ in range(self.max_streams)]
|
||||
self.buf_stat = [(ctypes.c_uint8 * 64)() for _ in range(self.max_streams)]
|
||||
self.buf_data_in = [(ctypes.c_uint8 * 0x1000)() for _ in range(self.max_streams)]
|
||||
self.buf_data_out = [(ctypes.c_uint8 * 0x1000)() for _ in range(self.max_streams)]
|
||||
|
||||
def _prep_transfer(self, tr, ep, stream_id, buf, length):
|
||||
tr.contents.dev_handle, tr.contents.endpoint, tr.contents.length, tr.contents.buffer = self.handle, ep, length, buf
|
||||
tr.contents.status, tr.contents.flags, tr.contents.timeout, tr.contents.num_iso_packets = 0xff, 0, 1000, 0
|
||||
tr.contents.type = (libusb.LIBUSB_TRANSFER_TYPE_BULK_STREAM if stream_id is not None else libusb.LIBUSB_TRANSFER_TYPE_BULK)
|
||||
if stream_id is not None: libusb.libusb_transfer_set_stream_id(tr, stream_id)
|
||||
return tr
|
||||
|
||||
def _submit_and_wait(self, cmds):
|
||||
for tr in cmds: libusb.libusb_submit_transfer(tr)
|
||||
|
||||
running = len(cmds)
|
||||
while running:
|
||||
libusb.libusb_handle_events(self.ctx)
|
||||
running = len(cmds)
|
||||
for tr in cmds:
|
||||
if tr.contents.status == libusb.LIBUSB_TRANSFER_COMPLETED: running -= 1
|
||||
elif tr.contents.status != 0xFF: raise RuntimeError(f"EP 0x{tr.contents.endpoint:02X} error: {tr.contents.status}")
|
||||
|
||||
def send_batch(self, cdbs:list[bytes], idata:list[int]|None=None, odata:list[bytes|None]|None=None) -> list[bytes|None]:
|
||||
idata, odata = idata or [0] * len(cdbs), odata or [None] * len(cdbs)
|
||||
results, tr_window, op_window = [], [], []
|
||||
|
||||
for idx, (cdb, rlen, send_data) in enumerate(zip(cdbs, idata, odata)):
|
||||
# allocate slot and stream. stream is 1-based
|
||||
slot, stream = idx % self.max_streams, (idx % self.max_streams) + 1
|
||||
|
||||
# build cmd packet
|
||||
struct.pack_into(">B", self.buf_cmd[slot], 3, stream)
|
||||
self.buf_cmd[slot][16:16+len(cdb)] = list(cdb)
|
||||
|
||||
# cmd + stat transfers
|
||||
tr_window.append(self._prep_transfer(self.tr[self.ep_cmd_out][slot], self.ep_cmd_out, None, self.buf_cmd[slot], len(self.buf_cmd[slot])))
|
||||
tr_window.append(self._prep_transfer(self.tr[self.ep_stat_in][slot], self.ep_stat_in, stream, self.buf_stat[slot], 64))
|
||||
|
||||
if rlen:
|
||||
if rlen > self.max_read_len: raise ValueError("read length > max_read_len per CDB")
|
||||
tr_window.append(self._prep_transfer(self.tr[self.ep_data_in][slot], self.ep_data_in, stream, self.buf_data_in[slot], rlen))
|
||||
|
||||
if send_data is not None:
|
||||
if len(send_data) > len(self.buf_data_out[slot]): self.buf_data_out[slot] = (ctypes.c_uint8 * len(send_data))()
|
||||
self.buf_data_out[slot][:len(send_data)] = list(send_data)
|
||||
tr_window.append(self._prep_transfer(self.tr[self.ep_data_out][slot], self.ep_data_out, stream, self.buf_data_out[slot], len(send_data)))
|
||||
|
||||
op_window.append((idx, slot, rlen))
|
||||
if (idx + 1 == len(cdbs)) or len(op_window) >= self.max_streams:
|
||||
self._submit_and_wait(tr_window)
|
||||
for idx, slot, rlen in op_window: results.append(bytes(self.buf_data_in[slot][:rlen]) if rlen else None)
|
||||
tr_window = []
|
||||
|
||||
return results
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class WriteOp:
  # Request to write the bytes of `data` to consecutive controller addresses starting at `addr`.
  addr: int
  data: bytes
  # When False, bytes whose value already matches the controller's write cache are skipped.
  ignore_cache: bool = True
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class ReadOp:
  # Request to read `size` bytes from the controller starting at `addr`.
  addr: int
  size: int
|
||||
|
||||
@dataclasses.dataclass(frozen=True)
class ScsiWriteOp:
  # Request to send `data` with a SCSI write command addressed at logical block `lba`.
  data: bytes
  lba: int = 0
|
||||
|
||||
class ASM24Controller:
  """Driver for the USB controller (device 0xADD1:0x0001) that is used to issue PCIe requests.

  Controller memory is accessed with vendor CDBs (0xE4 read, 0xE5 single-byte write) sent over
  `USB3`; PCIe requests are built by writing a set of controller registers in the 0xB2xx range.
  """
  def __init__(self):
    """Open the USB device and write the controller's initial register values."""
    self.usb = USB3(0xADD1, 0x0001, 0x81, 0x83, 0x02, 0x04)
    # Last value written per controller address; None means unknown (the address was read since).
    self._cache: dict[int, int|None] = {}

    # Init controller.
    self.exec_ops([WriteOp(0x54b, b' '), WriteOp(0x5a8, b'\x02'), WriteOp(0x5f8, b'\x04'), WriteOp(0x7ec, b'\x01\x00\x00\x00'),
      WriteOp(0xc422, b'\x02'), WriteOp(0x0, b'\x33')])

  def exec_ops(self, ops:Sequence[WriteOp|ReadOp|ScsiWriteOp]):
    """Translate `ops` into CDBs and send them as one batch.

    A `WriteOp` becomes one 0xE5 request per byte (bytes matching the cache are skipped when
    `ignore_cache` is False), a `ReadOp` one 0xE4 request (size at most 0xff), a `ScsiWriteOp`
    one 0x8A request whose payload is zero-padded to 4096 bytes.

    Returns:
      The result list of `USB3.send_batch`, one entry per request that was sent.
    """
    cdbs:list[bytes] = []
    idata:list[int] = []
    odata:list[bytes|None] = []

    def _add_req(cdb:bytes, i:int, o:bytes|None):
      # Append in place: writes expand to one request per byte, so rebuilding the lists each time would be quadratic.
      cdbs.append(cdb)
      idata.append(i)
      odata.append(o)

    for op in ops:
      if isinstance(op, WriteOp):
        for off, value in enumerate(op.data):
          addr = ((op.addr + off) & 0x1FFFF) | 0x500000
          if not op.ignore_cache and self._cache.get(addr) == value: continue
          _add_req(struct.pack('>BBBHB', 0xE5, value, addr >> 16, addr & 0xFFFF, 0), 0, None)
          self._cache[addr] = value
      elif isinstance(op, ReadOp):
        assert op.size <= 0xff
        addr = (op.addr & 0x1FFFF) | 0x500000
        _add_req(struct.pack('>BBBHB', 0xE4, op.size, addr >> 16, addr & 0xFFFF, 0), op.size, None)
        # Forget cached values for everything that is read.
        for i in range(op.size): self._cache[addr + i] = None
      elif isinstance(op, ScsiWriteOp): _add_req(struct.pack('>BBQIBB', 0x8A, 0, op.lba, 4096//512, 0, 0), 0, op.data+b'\x00'*(4096-len(op.data)))

    return self.usb.send_batch(cdbs, idata, odata)

  def write(self, base_addr:int, data:bytes, ignore_cache:bool=True): return self.exec_ops([WriteOp(base_addr, data, ignore_cache)])

  def scsi_write(self, buf:bytes, lba:int=0):
    """Send `buf` with a SCSI write, followed by two fixed register writes (0x171 and 0xce6e)."""
    self.exec_ops([ScsiWriteOp(buf, lba), WriteOp(0x171, b'\xff\xff\xff', ignore_cache=True), WriteOp(0xce6e, b'\x00\x00', ignore_cache=True)])

  def read(self, base_addr:int, length:int, stride:int=0xff) -> bytes:
    """Read `length` bytes of controller memory from `base_addr`, in requests of at most `stride` bytes."""
    parts = self.exec_ops([ReadOp(base_addr + off, min(stride, length - off)) for off in range(0, length, stride)])
    return b''.join(p or b'' for p in parts)[:length]

  def pcie_request(self, fmt_type, address, value=None, size=4, cnt=10):
    """Issue one PCIe request through the controller.

    Args:
      fmt_type: one byte written to register 0xB210 to select the request kind.
      address: target address; the request must not cross a 4-byte boundary.
      value: data to write, or None for a read.
      size: access size in bytes, 1 to 4.
      cnt: number of retries left when the controller flags an error (status bit 0).

    Returns:
      The value read for read requests; None for writes.

    Raises:
      RuntimeError: on a failed completion status, or an inconsistent 0xB284 flag for config requests.
    """
    assert fmt_type >> 8 == 0 and size > 0 and size <= 4, f"Invalid fmt_type {fmt_type} or size {size}"
    if DEBUG >= 3: print("pcie_request", hex(fmt_type), hex(address), value, size, cnt)

    # The request is dword aligned; `offset` selects the bytes inside the dword.
    masked_address, offset = address & 0xFFFFFFFC, address & 0x3
    assert size + offset <= 4

    ops = []
    if value is not None:
      assert value >> (8 * size) == 0
      ops.append(WriteOp(0xB220, struct.pack('>I', value << (8 * offset)), ignore_cache=False))

    # Address, byte-enable mask and request type use the write cache; the last two writes are always sent.
    ops += [WriteOp(0xB218, struct.pack('>I', masked_address), ignore_cache=False),
      WriteOp(0xB217, bytes([((1 << size) - 1) << offset]), ignore_cache=False),
      WriteOp(0xB210, bytes([fmt_type]), ignore_cache=False),
      WriteOp(0xB254, b"\x0f", ignore_cache=True), WriteOp(0xB296, b"\x04", ignore_cache=True)]
    self.exec_ops(ops)

    # Fast path for write requests
    if ((fmt_type & 0b11011111) == 0b01000000) or ((fmt_type & 0b10111000) == 0b00110000): return

    # Poll status until bit 1 is set; on bit 0 clear it and retry the whole request while retries remain.
    while (stat:=self.read(0xB296, 1)[0]) & 2 == 0:
      if stat & 1:
        self.write(0xB296, bytes([0x01]))
        if cnt > 0: return self.pcie_request(fmt_type, address, value, size, cnt=cnt-1)
    assert stat == 2, f"stat read 2 was {stat}"

    # Retrieve completion data from Link Status (0xB22A, 0xB22B)
    b284 = self.read(0xB284, 1)[0]
    completion = struct.unpack('>H', self.read(0xB22A, 2))

    # Validate completion status based on PCIe request type
    # Completion TLPs for configuration requests always have a byte count of 4.
    assert completion[0] & 0xfff == (4 if (fmt_type & 0xbe == 0x04) else size)

    # Extract completion status field
    status = (completion[0] >> 13) & 0x7

    # Handle completion errors or inconsistencies
    if status or ((fmt_type & 0xbe == 0x04) and (((value is None) and (not (b284 & 0x01))) or ((value is not None) and (b284 & 0x01)))):
      status_map = {0b000: "Successful Completion (SC)", 0b001: "Unsupported Request (UR)",
        0b010: "Configuration Request Retry Status (CRS)", 0b100: "Completer Abort (CA)"}
      raise RuntimeError("Completion status: {}, 0xB284 bit 0: {}".format(status_map.get(status, "Reserved (0b{:03b})".format(status)), b284 & 0x01))

    if value is None: return (struct.unpack('>I', self.read(0xB220, 4))[0] >> (8 * offset)) & ((1 << (8 * size)) - 1)

  def pcie_cfg_req(self, byte_addr, bus=1, dev=0, fn=0, value=None, size=4):
    """Read (value is None) or write the config space of bus/dev/fn at `byte_addr`.

    The request type is 0x44 for writes and 0x4 for reads, with bit 0 set when `bus` > 0.
    """
    assert byte_addr >> 12 == 0 and bus >> 8 == 0 and dev >> 5 == 0 and fn >> 3 == 0, f"Invalid byte_addr {byte_addr}, bus {bus}, dev {dev}, fn {fn}"

    fmt_type = (0x44 if value is not None else 0x4) | int(bus > 0)
    address = (bus << 24) | (dev << 19) | (fn << 16) | (byte_addr & 0xfff)
    return self.pcie_request(fmt_type, address, value, size)

  def pcie_mem_req(self, address, value=None, size=4): return self.pcie_request(0x40 if value is not None else 0x0, address, value, size)
|
||||
|
||||
class USBMMIOInterface(MMIOInterface):
  """MMIO-style indexable view over a memory range that is reached through the USB controller.

  With `pcimem=True` accesses are PCIe memory requests (`usb.pcie_mem_req`); otherwise they are
  reads/writes of the controller's own memory (`usb.read` / `usb.write` / `usb.scsi_write`).
  Indices are in units of `fmt` elements.
  """
  def __init__(self, usb, addr, size, fmt, pcimem=True):
    self.usb, self.addr, self.nbytes, self.fmt, self.pcimem, self.el_sz = usb, addr, size, fmt, pcimem, struct.calcsize(fmt)

  def __getitem__(self, index): return self._access_items(index)
  def __setitem__(self, index, val): self._access_items(index, val)

  def _access_items(self, index, val=None):
    """Read (val is None) or write the element or slice at `index`; slices return/accept raw bytes."""
    if isinstance(index, slice): return self._acc((index.start or 0) * self.el_sz, ((index.stop or len(self))-(index.start or 0)) * self.el_sz, val)
    return self._acc_one(index * self.el_sz, self.el_sz, val) if self.pcimem else self._acc(index * self.el_sz, self.el_sz, val)

  def view(self, offset:int=0, size:int|None=None, fmt=None):
    """Return a sub-view starting `offset` bytes in; size and fmt default to the remainder and this view's fmt."""
    return USBMMIOInterface(self.usb, self.addr+offset, size or (self.nbytes - offset), fmt=fmt or self.fmt, pcimem=self.pcimem)

  # Widest access (array typecode, byte width) that evenly divides `sz` bytes.
  def _acc_size(self, sz): return next(x for x in [('I', 4), ('H', 2), ('B', 1)] if sz % x[1] == 0)

  def _acc_one(self, off, sz, val=None):
    """Do one PCIe memory access of `sz` bytes at byte offset `off`; 8-byte accesses are split into two dwords.

    Returns the value read when `val` is None, otherwise None.
    """
    upper = 0 if sz < 8 else self.usb.pcie_mem_req(self.addr + off + 4, val if val is None else (val >> 32), 4)
    lower = self.usb.pcie_mem_req(self.addr + off, val if val is None else val & 0xffffffff, min(sz, 4))
    if val is None: return lower | (upper << 32)

  def _acc(self, off, sz, data=None):
    """Read `sz` bytes at byte offset `off` (data is None), or write `data` (an int or bytes-like) there."""
    if data is None: # read op
      if not self.pcimem: return self.usb.read(self.addr + off, sz)

      acc, acc_size = self._acc_size(sz)
      return bytes(array.array(acc, [self._acc_one(off + i * acc_size, acc_size) for i in range(sz // acc_size)]))
    else: # write op
      data = struct.pack(self.fmt, data) if isinstance(data, int) else bytes(data)

      if not self.pcimem:
        # Fast path for writing into buffer 0xf000
        # NOTE(review): `off` is not used on this path -- confirm callers only write from the start of that buffer.
        return self.usb.scsi_write(bytes(data)) if self.addr == 0xf000 else self.usb.write(self.addr + off, bytes(data))

      # `data` is raw bytes here, so the access width must divide its byte length; otherwise the
      # last access would be wider than the remaining payload and overwrite adjacent memory.
      _, acc_sz = self._acc_size(len(data))
      for i in range(0, len(data), acc_sz): self._acc_one(off + i, acc_sz, int.from_bytes(data[i:i+acc_sz], "little"))
|
||||
Loading…
Add table
Add a link
Reference in a new issue