mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
fix(egpu): program BAR addresses for Thunderbolt eGPUs on Apple Silicon
On macOS Apple Silicon, Thunderbolt-connected eGPUs don't get BAR addresses assigned by the OS. The BAR registers in PCI config space remain 0x00000000, causing _CopyDeviceMemoryWithIndex to return mappings that read as 0xFFFFFFFF (unmapped memory), making the GPU unusable for compute. This patch adds ProgramBARAddresses() to TinyGPUDriver::Start_Impl, which: 1. Checks if BAR0 is unassigned (0x00000000 or 0xFFFFFFFF) 2. Determines each BAR's size requirement by writing/reading 0xFFFFFFFF 3. Reads the parent Thunderbolt bridge's memory aperture (MEM base/limit, prefetchable MEM base/limit) from PCI config space 4. Allocates naturally-aligned BAR addresses from the bridge aperture 5. Writes the addresses to BAR registers in config space 6. Re-enables Memory Space and Bus Master in the command register 7. Logs all steps for debugging This mirrors what Linux does in pci_assign_resource() and what mac-amdgpu does manually for the R9700. Without this, the DriverKit extension can enumerate the GPU but cannot access its MMIO registers. Tested-on: AMD Radeon AI PRO R9700 32GB (RDNA4) on M4 Mac Mini via ACASIS G4Pro Thunderbolt 4 enclosure. Fixes: #15813, #15864, #16714 Related: #15730, #15744
This commit is contained in:
parent
7c1d0b6d9a
commit
ec93db06db
2 changed files with 214 additions and 2 deletions
|
|
@ -31,6 +31,209 @@ void TinyGPUDriver::free()
|
|||
super::free();
|
||||
}
|
||||
|
||||
void TinyGPUDriver::ProgramBARAddresses()
|
||||
{
|
||||
// On Apple Silicon, macOS may not assign BAR addresses for Thunderbolt eGPUs.
|
||||
// The BAR registers in PCI config space remain 0x00000000, causing
|
||||
// _CopyDeviceMemoryWithIndex to return mappings that read as 0xFFFFFFFF.
|
||||
//
|
||||
// This function checks if BARs are unassigned and programs them by:
|
||||
// 1. Determining each BAR's size requirement (write 0xFFFFFFFF, read back)
|
||||
// 2. Reading the parent Thunderbolt bridge's memory aperture
|
||||
// 3. Allocating BAR addresses from the aperture
|
||||
// 4. Writing the addresses to BAR registers
|
||||
// 5. Re-enabling Memory Space
|
||||
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: checking BAR assignments");
|
||||
|
||||
// Check if BAR0 is already assigned by the OS
|
||||
uint32_t bar0 = 0;
|
||||
ivars->pci->ConfigurationRead32(kIOPCIConfigurationOffsetBaseAddress0, &bar0);
|
||||
if (bar0 != 0 && bar0 != 0xFFFFFFFF) {
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: BAR0 already assigned (0x%08x), skipping BAR programming", bar0);
|
||||
return;
|
||||
}
|
||||
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: BAR0 unassigned (0x%08x), programming BAR addresses for eGPU", bar0);
|
||||
|
||||
// Disable memory and I/O access while programming BARs
|
||||
uint16_t cmd = 0;
|
||||
ivars->pci->ConfigurationRead16(kIOPCIConfigurationOffsetCommand, &cmd);
|
||||
ivars->pci->ConfigurationWrite16(kIOPCIConfigurationOffsetCommand, cmd & ~(kIOPCICommandIOSpace | kIOPCICommandMemorySpace));
|
||||
|
||||
// Step 1: Determine BAR sizes
|
||||
uint64_t bar_sizes[6] = {};
|
||||
bool bar_64bit[6] = {};
|
||||
uint32_t bar_origins[6] = {};
|
||||
|
||||
for (int i = 0; i < 6; ) {
|
||||
uint32_t offset = kIOPCIConfigurationOffsetBaseAddress0 + i * 4;
|
||||
|
||||
// Save original
|
||||
ivars->pci->ConfigurationRead32(offset, &bar_origins[i]);
|
||||
|
||||
// Write all 1s to determine size
|
||||
ivars->pci->ConfigurationWrite32(offset, 0xFFFFFFFF);
|
||||
uint32_t readback = 0;
|
||||
ivars->pci->ConfigurationRead32(offset, &readback);
|
||||
|
||||
// Restore original
|
||||
ivars->pci->ConfigurationWrite32(offset, bar_origins[i]);
|
||||
|
||||
if (readback == 0 || readback == 0xFFFFFFFF) {
|
||||
bar_sizes[i] = 0;
|
||||
bar_64bit[i] = false;
|
||||
i++;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Decode BAR type
|
||||
uint8_t bar_type = readback & 0x7;
|
||||
bar_64bit[i] = (bar_type == 0x4);
|
||||
|
||||
// Compute size: mask off type/info bits, invert, add 1
|
||||
uint32_t mask = readback & ~0xF;
|
||||
uint64_t size = ~(uint64_t)mask + 1;
|
||||
|
||||
if (bar_64bit[i] && i + 1 < 6) {
|
||||
// For 64-bit BARs, also probe upper 32 bits
|
||||
uint32_t offset_hi = kIOPCIConfigurationOffsetBaseAddress0 + (i + 1) * 4;
|
||||
uint32_t orig_hi = 0;
|
||||
ivars->pci->ConfigurationRead32(offset_hi, &orig_hi);
|
||||
|
||||
ivars->pci->ConfigurationWrite32(offset_hi, 0xFFFFFFFF);
|
||||
uint32_t readback_hi = 0;
|
||||
ivars->pci->ConfigurationRead32(offset_hi, &readback_hi);
|
||||
|
||||
ivars->pci->ConfigurationWrite32(offset_hi, orig_hi);
|
||||
|
||||
bar_origins[i + 1] = orig_hi;
|
||||
// 64-bit size = lower 32 inverted + upper 32 inverted << 32
|
||||
uint64_t size_hi = (~(uint64_t)readback_hi) << 32;
|
||||
bar_sizes[i] = (size & 0xFFFFFFFF) | size_hi;
|
||||
if (bar_sizes[i] == 0) bar_sizes[i] = ((uint64_t)1 << 32);
|
||||
bar_sizes[i + 1] = 0; // Upper half is part of the same BAR
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d: size=0x%llx 64bit=1", i, bar_sizes[i]);
|
||||
i += 2;
|
||||
} else {
|
||||
bar_sizes[i] = size;
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d: size=0x%llx 64bit=0", i, size);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 2: Read parent bridge memory aperture
|
||||
// Thunderbolt bridges expose memory windows at standard PCI bridge config offsets
|
||||
// Offset 0x20: Memory Base (16-bit, lower 28 bits of base address, 1MB aligned)
|
||||
// Offset 0x22: Memory Limit (16-bit, lower 28 bits of limit address)
|
||||
// Offset 0x24: Prefetchable Memory Base Lower (16-bit)
|
||||
// Offset 0x28: Prefetchable Memory Base Upper (32-bit)
|
||||
// Offset 0x2C: Prefetchable Memory Limit Upper (32-bit)
|
||||
|
||||
OSObject* parentObj = nullptr;
|
||||
ivars->pci->CopyParent(&parentObj);
|
||||
IOPCIDevice* bridge = OSDynamicCast(IOPCIDevice, parentObj);
|
||||
|
||||
uint64_t mem_base = 0, mem_limit = 0;
|
||||
if (bridge) {
|
||||
bridge->Open(this, 0);
|
||||
uint16_t mem_base_reg = 0, mem_limit_reg = 0;
|
||||
bridge->ConfigurationRead16(0x20, &mem_base_reg);
|
||||
bridge->ConfigurationRead16(0x22, &mem_limit_reg);
|
||||
|
||||
// Decode: Memory Base/Limit are in units of 1MB, aligned to 1MB boundaries
|
||||
mem_base = ((uint64_t)(mem_base_reg & 0xFFF0) << 16);
|
||||
mem_limit = (((uint64_t)(mem_limit_reg & 0xFFF0) << 16) | 0xFFFFF);
|
||||
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: bridge mem window 0x%llx - 0x%llx (base_reg=0x%04x limit_reg=0x%04x)", mem_base, mem_limit, mem_base_reg, mem_limit_reg);
|
||||
|
||||
// Also check prefetchable memory window for 64-bit BARs
|
||||
uint16_t pref_base_low = 0, pref_limit_low = 0;
|
||||
uint32_t pref_base_hi = 0, pref_limit_hi = 0;
|
||||
bridge->ConfigurationRead16(0x24, &pref_base_low);
|
||||
bridge->ConfigurationRead16(0x26, &pref_limit_low);
|
||||
bridge->ConfigurationRead32(0x28, &pref_base_hi);
|
||||
bridge->ConfigurationRead32(0x2C, &pref_limit_hi);
|
||||
|
||||
uint64_t pref_base = ((uint64_t)(pref_base_low & 0xFFF0) << 16) | ((uint64_t)pref_base_hi << 32);
|
||||
uint64_t pref_limit = (((uint64_t)(pref_limit_low & 0xFFF0) << 16) | 0xFFFFF) | ((uint64_t)pref_limit_hi << 32);
|
||||
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: bridge pref mem window 0x%llx - 0x%llx", pref_base, pref_limit);
|
||||
|
||||
// Use prefetchable window for 64-bit BARs if available
|
||||
if (pref_base != 0 && pref_limit > pref_base) {
|
||||
// Use non-prefetchable for 32-bit BARs, prefetchable for 64-bit BARs
|
||||
// For now, we allocate everything from the larger window
|
||||
if (pref_limit - pref_base > mem_limit - mem_base) {
|
||||
mem_base = pref_base;
|
||||
mem_limit = pref_limit;
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: using pref mem window for BAR allocation (larger)");
|
||||
}
|
||||
}
|
||||
|
||||
bridge->Close(this, 0);
|
||||
} else {
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: WARNING: parent is not IOPCIDevice, cannot determine bridge aperture");
|
||||
}
|
||||
|
||||
if (parentObj) parentObj->release();
|
||||
|
||||
// Step 3: Allocate and program BAR addresses
|
||||
if (mem_base == 0 && mem_limit == 0) {
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: WARNING: bridge memory window not available, cannot program BARs");
|
||||
ivars->pci->ConfigurationWrite16(kIOPCIConfigurationOffsetCommand, cmd);
|
||||
return;
|
||||
}
|
||||
|
||||
uint64_t next_addr = mem_base;
|
||||
for (int i = 0; i < 6; ) {
|
||||
if (bar_sizes[i] == 0) { i++; continue; }
|
||||
|
||||
// Align to BAR size (PCI BARs must be naturally aligned)
|
||||
uint64_t alignment = bar_sizes[i];
|
||||
uint64_t addr = (next_addr + alignment - 1) & ~(alignment - 1);
|
||||
|
||||
if (addr + bar_sizes[i] > mem_limit) {
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: ERROR: not enough space for BAR%d (need 0x%llx at 0x%llx, limit 0x%llx)", i, bar_sizes[i], addr, mem_limit);
|
||||
break;
|
||||
}
|
||||
|
||||
uint32_t offset = kIOPCIConfigurationOffsetBaseAddress0 + i * 4;
|
||||
|
||||
if (bar_64bit[i]) {
|
||||
uint32_t bar_lo = (uint32_t)(addr & ~0xF) | 0x04; // 64-bit, prefetchable bit preserved
|
||||
uint32_t bar_hi = (uint32_t)(addr >> 32);
|
||||
ivars->pci->ConfigurationWrite32(offset, bar_lo);
|
||||
ivars->pci->ConfigurationWrite32(offset + 4, bar_hi);
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d (64-bit): addr=0x%llx size=0x%llx lo=0x%08x hi=0x%08x", i, addr, bar_sizes[i], bar_lo, bar_hi);
|
||||
next_addr = addr + bar_sizes[i];
|
||||
i += 2;
|
||||
} else {
|
||||
uint32_t bar_val = (uint32_t)(addr & ~0xF) | 0x00; // 32-bit, non-prefetchable
|
||||
ivars->pci->ConfigurationWrite32(offset, bar_val);
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d (32-bit): addr=0x%llx size=0x%llx val=0x%08x", i, addr, bar_sizes[i], bar_val);
|
||||
next_addr = addr + bar_sizes[i];
|
||||
i++;
|
||||
}
|
||||
}
|
||||
|
||||
// Step 4: Re-enable Memory Space and Bus Master
|
||||
ivars->pci->ConfigurationWrite16(kIOPCIConfigurationOffsetCommand, cmd);
|
||||
|
||||
// Verify BAR programming
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: BAR verification:");
|
||||
for (int i = 0; i < 6; i++) {
|
||||
uint32_t offset = kIOPCIConfigurationOffsetBaseAddress0 + i * 4;
|
||||
uint32_t val = 0;
|
||||
ivars->pci->ConfigurationRead32(offset, &val);
|
||||
if (val != 0) {
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d = 0x%08x", i, val);
|
||||
}
|
||||
}
|
||||
|
||||
os_log(OS_LOG_DEFAULT, "tinygpu: BAR address programming complete");
|
||||
}
|
||||
|
||||
kern_return_t TinyGPUDriver::Start_Impl(IOService* in_provider)
|
||||
{
|
||||
IOServiceName service_name;
|
||||
|
|
@ -59,6 +262,11 @@ kern_return_t TinyGPUDriver::Start_Impl(IOService* in_provider)
|
|||
commandRegister |= (kIOPCICommandIOSpace | kIOPCICommandBusMaster | kIOPCICommandMemorySpace);
|
||||
ivars->pci->ConfigurationWrite16(kIOPCIConfigurationOffsetCommand, commandRegister);
|
||||
|
||||
// Program BAR addresses for eGPUs where macOS doesn't assign them.
|
||||
// On Apple Silicon Thunderbolt, BAR registers may remain 0x00000000,
|
||||
// causing _CopyDeviceMemoryWithIndex to fail silently.
|
||||
ProgramBARAddresses();
|
||||
|
||||
memcpy((void*)service_name, (void*)"tinygpu\0", 8);
|
||||
SetName(service_name);
|
||||
|
||||
|
|
@ -195,4 +403,4 @@ kern_return_t TinyGPUDriver::ResetDevice()
|
|||
// Accessor: hands back the IOPCIDevice pointer stored in ivars->pci as-is.
// No retain is taken here and the pointer is not checked for null.
IOPCIDevice* TinyGPUDriver::GetPCI()
|
||||
{
|
||||
return ivars->pci;
|
||||
}
|
||||
}
|
||||
|
|
@ -35,6 +35,10 @@ public:
|
|||
kern_return_t CfgWrite(uint32_t off, uint32_t size, uint32_t val) LOCALONLY;
|
||||
kern_return_t ResetDevice() LOCALONLY;
|
||||
IOPCIDevice* GetPCI() LOCALONLY;
|
||||
|
||||
// Program BAR addresses for eGPUs where macOS doesn't assign them.
|
||||
// Called from Start_Impl after enabling Memory Space and Bus Master.
|
||||
void ProgramBARAddresses() LOCALONLY;
|
||||
};
|
||||
|
||||
#endif /* TinyGPUDriver_h */
|
||||
#endif /* TinyGPUDriver_h */
|
||||
Loading…
Add table
Add a link
Reference in a new issue