fix(egpu): program BAR addresses for Thunderbolt eGPUs on Apple Silicon

On macOS Apple Silicon, Thunderbolt-connected eGPUs don't get BAR addresses
assigned by the OS. The BAR registers in PCI config space remain 0x00000000,
causing _CopyDeviceMemoryWithIndex to return mappings that read as 0xFFFFFFFF
(unmapped memory), making the GPU unusable for compute.

This patch adds ProgramBARAddresses() to TinyGPUDriver::Start_Impl, which:
1. Checks if BAR0 is unassigned (0x00000000 or 0xFFFFFFFF)
2. Determines each BAR's size requirement by writing 0xFFFFFFFF and reading the value back
3. Reads the parent Thunderbolt bridge's memory aperture (MEM base/limit,
   prefetchable MEM base/limit) from PCI config space
4. Allocates naturally-aligned BAR addresses from the bridge aperture
5. Writes the addresses to BAR registers in config space
6. Restores the saved command register value, turning I/O and Memory Space decoding back on
7. Logs all steps for debugging

This mirrors what Linux does in pci_assign_resource() and what mac-amdgpu
does manually for the R9700. Without this, the DriverKit extension can
enumerate the GPU but cannot access its MMIO registers.

Tested-on: AMD Radeon AI PRO R9700 32GB (RDNA4) on M4 Mac Mini via
ACASIS G4Pro Thunderbolt 4 enclosure.

Fixes: #15813, #15864, #16714
Related: #15730, #15744
This commit is contained in:
Ember 2026-06-23 20:59:13 +02:00
commit ec93db06db
2 changed files with 214 additions and 2 deletions

View file

@ -31,6 +31,209 @@ void TinyGPUDriver::free()
super::free();
}
// Number of Base Address Registers in a type-0 PCI configuration header.
static constexpr int kTinyGPUBarCount = 6;

// Allocation state for one bridge memory window.
struct TinyGPUBarWindow {
    uint64_t next;   // lowest address that has not been handed out yet
    uint64_t limit;  // last usable address of the window (inclusive)
    bool     valid;  // false when the window is disabled, unprogrammed or exhausted
};

// Decodes the size of a memory BAR from the value read back after writing
// all ones. `probe_hi` is only used for 64-bit BARs. Returns 0 when no
// address bit is writable (BAR not implemented).
static uint64_t TinyGPUDecodeMemBarSize(uint32_t probe_lo, uint32_t probe_hi, bool is_64bit)
{
    // Bits 3:0 of a memory BAR are read-only type/prefetch flags, not address bits.
    const uint32_t addr_lo = probe_lo & 0xFFFFFFF0u;
    if (!is_64bit) {
        // Stay in 32-bit arithmetic so the upper half of the result is zero.
        return (uint64_t)(uint32_t)(~addr_lo + 1u);
    }
    // Invert the combined 64-bit mask as one value; inverting the halves
    // separately gives wrong results for BARs of 4 GiB and larger.
    const uint64_t mask = ((uint64_t)probe_hi << 32) | addr_lo;
    return ~mask + 1;
}

// Builds a window from a bridge base/limit register pair (type-1 header).
// Bits 15:4 of each register hold address bits 31:20; the limit covers the
// whole last 1 MiB block. `*_upper` carry address bits 63:32 (0 for the
// non-prefetchable window, which is 32-bit only).
static TinyGPUBarWindow TinyGPUDecodeBridgeWindow(uint16_t base_reg, uint16_t limit_reg,
                                                  uint32_t base_upper, uint32_t limit_upper)
{
    TinyGPUBarWindow win;
    win.next  = ((uint64_t)base_upper << 32) | ((uint64_t)(base_reg & 0xFFF0) << 16);
    win.limit = ((uint64_t)limit_upper << 32) | ((uint64_t)(limit_reg & 0xFFF0) << 16) | 0xFFFFF;
    // A base of 0 means the bridge was never programmed; limit < base means
    // the window is disabled.
    win.valid = (win.next != 0) && (win.limit > win.next);
    return win;
}

// Carves a naturally aligned region of `size` bytes (a power of two) out of
// `win`. The region's last byte must not exceed `max_addr`. On success stores
// the start in `*out_addr`, advances the window and returns true.
static bool TinyGPUAllocFromWindow(TinyGPUBarWindow* win, uint64_t size, uint64_t max_addr, uint64_t* out_addr)
{
    if (!win->valid || size == 0) {
        return false;
    }
    const uint64_t align_mask = size - 1;
    if (win->next > ~(uint64_t)0 - align_mask) {
        return false;  // rounding up would overflow
    }
    const uint64_t addr = (win->next + align_mask) & ~align_mask;
    const uint64_t last = addr + align_mask;  // inclusive end of the region
    if (last < addr || last > win->limit || last > max_addr) {
        return false;
    }
    *out_addr = addr;
    if (last == win->limit) {
        win->valid = false;  // window fully consumed
    } else {
        win->next = last + 1;
    }
    return true;
}

// Programs the device's memory BARs when macOS left them unassigned
// (seen with Thunderbolt eGPUs on Apple Silicon).
//
// Steps:
//   1. Return early if BAR0 already holds an address.
//   2. Disable I/O and Memory Space decoding, then size every BAR by writing
//      all ones and reading the value back. I/O BARs are skipped.
//   3. Read the parent bridge's non-prefetchable and prefetchable windows.
//   4. Place each BAR naturally aligned: prefetchable BARs try the
//      prefetchable window first, everything else (and the fallback) uses the
//      non-prefetchable window. 32-bit BARs are kept below 4 GiB.
//   5. Restore the command register saved in step 2 and log the final BARs.
//
// Returns nothing; all outcomes are reported through os_log.
void TinyGPUDriver::ProgramBARAddresses()
{
    os_log(OS_LOG_DEFAULT, "tinygpu: checking BAR assignments");

    // Step 1: decide whether BAR0 is assigned by looking at its address bits
    // only. The low flag bits are hard-wired by the device, so an unassigned
    // 64-bit prefetchable BAR reads 0x0000000C rather than 0.
    uint32_t bar0 = 0;
    ivars->pci->ConfigurationRead32(kIOPCIConfigurationOffsetBaseAddress0, &bar0);
    bool bar0_assigned = false;
    if (bar0 != 0xFFFFFFFF) {
        if (bar0 & 0x1) {
            bar0_assigned = (bar0 & 0xFFFFFFFCu) != 0;  // I/O BAR: address in bits 31:2
        } else {
            bar0_assigned = (bar0 & 0xFFFFFFF0u) != 0;
            if (!bar0_assigned && (bar0 & 0x6) == 0x4) {
                // A large 64-bit BAR may have all of its address in the upper dword.
                uint32_t bar0_hi = 0;
                ivars->pci->ConfigurationRead32(kIOPCIConfigurationOffsetBaseAddress0 + 4, &bar0_hi);
                bar0_assigned = (bar0_hi != 0 && bar0_hi != 0xFFFFFFFF);
            }
        }
    }
    if (bar0_assigned) {
        os_log(OS_LOG_DEFAULT, "tinygpu: BAR0 already assigned (0x%08x), skipping BAR programming", bar0);
        return;
    }
    os_log(OS_LOG_DEFAULT, "tinygpu: BAR0 unassigned (0x%08x), programming BAR addresses for eGPU", bar0);

    // Step 2: stop the device from decoding I/O and memory while BARs change.
    uint16_t cmd = 0;
    ivars->pci->ConfigurationRead16(kIOPCIConfigurationOffsetCommand, &cmd);
    ivars->pci->ConfigurationWrite16(kIOPCIConfigurationOffsetCommand, cmd & ~(kIOPCICommandIOSpace | kIOPCICommandMemorySpace));

    uint64_t bar_sizes[kTinyGPUBarCount] = {};
    bool bar_64bit[kTinyGPUBarCount] = {};
    bool bar_pref[kTinyGPUBarCount] = {};
    for (int i = 0; i < kTinyGPUBarCount; ) {
        const uint32_t offset = kIOPCIConfigurationOffsetBaseAddress0 + i * 4;

        // Probe the low dword and put the original value back.
        uint32_t orig_lo = 0;
        uint32_t probe_lo = 0;
        ivars->pci->ConfigurationRead32(offset, &orig_lo);
        ivars->pci->ConfigurationWrite32(offset, 0xFFFFFFFF);
        ivars->pci->ConfigurationRead32(offset, &probe_lo);
        ivars->pci->ConfigurationWrite32(offset, orig_lo);

        if (probe_lo == 0 || probe_lo == 0xFFFFFFFF) {
            i++;  // not implemented, or the device did not respond
            continue;
        }
        if (probe_lo & 0x1) {
            // I/O BARs use a different layout and cannot live in a memory window.
            os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d is an I/O BAR, skipping", i);
            i++;
            continue;
        }

        // Bits 2:1 == 10b marks a 64-bit BAR that also consumes the next slot.
        const bool is_64bit = ((probe_lo & 0x6) == 0x4) && (i + 1 < kTinyGPUBarCount);
        const int slots = is_64bit ? 2 : 1;
        bar_64bit[i] = is_64bit;
        bar_pref[i] = (probe_lo & 0x8) != 0;

        uint32_t probe_hi = 0;
        if (is_64bit) {
            uint32_t orig_hi = 0;
            ivars->pci->ConfigurationRead32(offset + 4, &orig_hi);
            ivars->pci->ConfigurationWrite32(offset + 4, 0xFFFFFFFF);
            ivars->pci->ConfigurationRead32(offset + 4, &probe_hi);
            ivars->pci->ConfigurationWrite32(offset + 4, orig_hi);
        }

        const uint64_t size = TinyGPUDecodeMemBarSize(probe_lo, probe_hi, is_64bit);
        if (size == 0 || (size & (size - 1)) != 0) {
            // A valid BAR size is a power of two; anything else cannot be aligned.
            os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d: ignoring invalid size 0x%llx", i, size);
            i += slots;
            continue;
        }
        bar_sizes[i] = size;
        if (is_64bit) {
            os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d: size=0x%llx 64bit=1", i, size);
        } else {
            os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d: size=0x%llx 64bit=0", i, size);
        }
        i += slots;
    }

    // Step 3: read the parent bridge's memory windows (type-1 header):
    //   0x20 Memory Base / 0x22 Memory Limit (non-prefetchable, 32-bit only)
    //   0x24 Prefetchable Base / 0x26 Prefetchable Limit (address bits 31:20)
    //   0x28 Prefetchable Base Upper / 0x2C Prefetchable Limit Upper (bits 63:32)
    TinyGPUBarWindow mem_win = {0, 0, false};
    TinyGPUBarWindow pref_win = {0, 0, false};
    OSObject* parentObj = nullptr;
    ivars->pci->CopyParent(&parentObj);
    IOPCIDevice* bridge = OSDynamicCast(IOPCIDevice, parentObj);
    if (bridge) {
        // 0 is kIOReturnSuccess. Keep going on failure (best effort): if the
        // reads return nothing, both windows decode as invalid below.
        const bool bridge_opened = (bridge->Open(this, 0) == 0);
        if (!bridge_opened) {
            os_log(OS_LOG_DEFAULT, "tinygpu: WARNING: could not open parent bridge, window reads may fail");
        }

        uint16_t mem_base_reg = 0, mem_limit_reg = 0;
        bridge->ConfigurationRead16(0x20, &mem_base_reg);
        bridge->ConfigurationRead16(0x22, &mem_limit_reg);
        mem_win = TinyGPUDecodeBridgeWindow(mem_base_reg, mem_limit_reg, 0, 0);
        os_log(OS_LOG_DEFAULT, "tinygpu: bridge mem window 0x%llx - 0x%llx (base_reg=0x%04x limit_reg=0x%04x)", mem_win.next, mem_win.limit, mem_base_reg, mem_limit_reg);

        uint16_t pref_base_low = 0, pref_limit_low = 0;
        uint32_t pref_base_hi = 0, pref_limit_hi = 0;
        bridge->ConfigurationRead16(0x24, &pref_base_low);
        bridge->ConfigurationRead16(0x26, &pref_limit_low);
        bridge->ConfigurationRead32(0x28, &pref_base_hi);
        bridge->ConfigurationRead32(0x2C, &pref_limit_hi);
        pref_win = TinyGPUDecodeBridgeWindow(pref_base_low, pref_limit_low, pref_base_hi, pref_limit_hi);
        os_log(OS_LOG_DEFAULT, "tinygpu: bridge pref mem window 0x%llx - 0x%llx", pref_win.next, pref_win.limit);

        if (bridge_opened) {
            bridge->Close(this, 0);
        }
    } else {
        os_log(OS_LOG_DEFAULT, "tinygpu: WARNING: parent is not IOPCIDevice, cannot determine bridge aperture");
    }
    if (parentObj) parentObj->release();

    if (!mem_win.valid && !pref_win.valid) {
        os_log(OS_LOG_DEFAULT, "tinygpu: WARNING: bridge memory window not available, cannot program BARs");
        ivars->pci->ConfigurationWrite16(kIOPCIConfigurationOffsetCommand, cmd);
        return;
    }

    // Step 4: place and program each BAR. One BAR that does not fit must not
    // stop the others from being programmed.
    for (int i = 0; i < kTinyGPUBarCount; i += (bar_64bit[i] ? 2 : 1)) {
        const uint64_t size = bar_sizes[i];
        if (size == 0) {
            continue;
        }

        // A 32-bit BAR cannot hold an address above 4 GiB.
        const uint64_t max_addr = bar_64bit[i] ? ~(uint64_t)0 : (uint64_t)0xFFFFFFFFu;
        uint64_t addr = 0;
        bool placed = false;
        if (bar_pref[i]) {
            placed = TinyGPUAllocFromWindow(&pref_win, size, max_addr, &addr);
        }
        if (!placed) {
            // Non-prefetchable BARs must stay out of the prefetchable window;
            // prefetchable ones may fall back to the non-prefetchable window.
            placed = TinyGPUAllocFromWindow(&mem_win, size, max_addr, &addr);
        }
        if (!placed) {
            os_log(OS_LOG_DEFAULT, "tinygpu: ERROR: no bridge window can hold BAR%d (size 0x%llx, 64bit=%d, prefetchable=%d)", i, size, (int)bar_64bit[i], (int)bar_pref[i]);
            continue;
        }

        const uint32_t offset = kIOPCIConfigurationOffsetBaseAddress0 + i * 4;
        // Only address bits are written; the device ignores writes to bits 3:0.
        const uint32_t bar_lo = (uint32_t)(addr & 0xFFFFFFF0u);
        ivars->pci->ConfigurationWrite32(offset, bar_lo);
        if (bar_64bit[i]) {
            const uint32_t bar_hi = (uint32_t)(addr >> 32);
            ivars->pci->ConfigurationWrite32(offset + 4, bar_hi);
            os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d (64-bit): addr=0x%llx size=0x%llx lo=0x%08x hi=0x%08x", i, addr, size, bar_lo, bar_hi);
        } else {
            os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d (32-bit): addr=0x%llx size=0x%llx val=0x%08x", i, addr, size, bar_lo);
        }
    }

    // Step 5: restore the command register saved before probing, which turns
    // I/O and Memory Space decoding back on if they were enabled on entry.
    ivars->pci->ConfigurationWrite16(kIOPCIConfigurationOffsetCommand, cmd);

    // Read every BAR back so the log shows what the device actually latched.
    os_log(OS_LOG_DEFAULT, "tinygpu: BAR verification:");
    for (int i = 0; i < kTinyGPUBarCount; i++) {
        const uint32_t offset = kIOPCIConfigurationOffsetBaseAddress0 + i * 4;
        uint32_t val = 0;
        ivars->pci->ConfigurationRead32(offset, &val);
        if (val != 0) {
            os_log(OS_LOG_DEFAULT, "tinygpu: BAR%d = 0x%08x", i, val);
        }
    }
    os_log(OS_LOG_DEFAULT, "tinygpu: BAR address programming complete");
}
kern_return_t TinyGPUDriver::Start_Impl(IOService* in_provider)
{
IOServiceName service_name;
@ -59,6 +262,11 @@ kern_return_t TinyGPUDriver::Start_Impl(IOService* in_provider)
commandRegister |= (kIOPCICommandIOSpace | kIOPCICommandBusMaster | kIOPCICommandMemorySpace);
ivars->pci->ConfigurationWrite16(kIOPCIConfigurationOffsetCommand, commandRegister);
// Program BAR addresses for eGPUs where macOS doesn't assign them.
// On Apple Silicon Thunderbolt, BAR registers may remain 0x00000000,
// causing _CopyDeviceMemoryWithIndex to fail silently.
ProgramBARAddresses();
memcpy((void*)service_name, (void*)"tinygpu\0", 8);
SetName(service_name);
@ -195,4 +403,4 @@ kern_return_t TinyGPUDriver::ResetDevice()
// Accessor for the PCI device pointer kept in the driver's instance variables.
// Hands back the stored pointer as-is.
IOPCIDevice* TinyGPUDriver::GetPCI()
{
    IOPCIDevice* const device = ivars->pci;
    return device;
}
}

View file

@ -35,6 +35,10 @@ public:
// NOTE(review): definition not visible in this view — presumably writes `val`
// (`size` bytes wide) to PCI configuration space at offset `off`; confirm.
kern_return_t CfgWrite(uint32_t off, uint32_t size, uint32_t val) LOCALONLY;
// NOTE(review): definition not visible in this view — presumably resets the
// GPU; confirm which kind of reset is performed and what the return codes mean.
kern_return_t ResetDevice() LOCALONLY;
// Returns the IOPCIDevice pointer stored in ivars->pci.
IOPCIDevice* GetPCI() LOCALONLY;
// Program BAR addresses for eGPUs where macOS doesn't assign them.
// Returns immediately when BAR0 already holds an address. Otherwise it sizes
// the device's BARs, reads the parent bridge's memory windows, writes
// addresses taken from those windows into the BARs, and restores the command
// register value it saved on entry. Progress and failures are reported only
// through os_log; nothing is returned to the caller.
// Called from Start_Impl right after it sets I/O Space, Memory Space and
// Bus Master in the command register.
void ProgramBARAddresses() LOCALONLY;
};
#endif /* TinyGPUDriver_h */
#endif /* TinyGPUDriver_h */