hcq_smi: kill mac pids (#14398)

This commit is contained in:
nimlgen 2026-01-28 15:00:28 +03:00 committed by GitHub
commit 544928766d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 21 additions and 25 deletions

View file

@ -145,6 +145,10 @@ jobs:
run: |
echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
- name: Kill stale pids
run: |
PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
PYTHONPATH=. ./extra/hcq/hcq_smi.py nv kill_pids
- name: UsbGPU boot time
run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
- name: UsbGPU tiny tests
@ -332,9 +336,9 @@ jobs:
- name: Setcap to python
run: ./extra/amdpci/setup_python_cap.sh
- name: Remove amd modules
run: ./extra/hcq/hcq_smi.py amd rmmod
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
- name: Kill stale pids
run: ./extra/hcq/hcq_smi.py amd kill_pids
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
#- name: Insert amdgpu
# run: sudo modprobe amdgpu
- name: Symlink models and datasets
@ -444,9 +448,9 @@ jobs:
- name: Setcap to python
run: ./extra/amdpci/setup_python_cap.sh
- name: Remove amd modules
run: ./extra/hcq/hcq_smi.py amd rmmod
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
- name: Kill stale pids
run: ./extra/hcq/hcq_smi.py amd kill_pids
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
- name: Symlink models and datasets
run: |
mkdir -p weights
@ -496,9 +500,9 @@ jobs:
- name: Setcap to python
run: ./extra/amdpci/setup_python_cap.sh
- name: Remove amd modules
run: ./extra/hcq/hcq_smi.py amd rmmod
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
- name: Kill stale pids
run: ./extra/hcq/hcq_smi.py amd kill_pids
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
- name: Symlink models and datasets
run: |
mkdir -p weights
@ -587,9 +591,9 @@ jobs:
- name: Setcap to python
run: ./extra/amdpci/setup_python_cap.sh
- name: Remove amd modules
run: ./extra/hcq/hcq_smi.py amd rmmod
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
- name: Kill stale pids
run: ./extra/hcq/hcq_smi.py amd kill_pids
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
- name: Symlink models and datasets
run: |
mkdir -p weights
@ -651,9 +655,9 @@ jobs:
- name: Setcap to python
run: ./extra/amdpci/setup_python_cap.sh
- name: Remove nv modules
run: ./extra/hcq/hcq_smi.py nv rmmod
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py nv rmmod
- name: Kill stale pids
run: ./extra/hcq/hcq_smi.py nv kill_pids
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py nv kill_pids
- name: Symlink models and datasets
run: |
mkdir -p weights

View file

@ -1,14 +1,15 @@
#!/usr/bin/env python3
import argparse, glob, os, time, subprocess, sys
from tinygrad.helpers import temp
def scan_devs_based_on_lock(prefix:str, args) -> list[str]:
# Collects the device ids encoded in lock-file names of the form `{prefix}_<dev_id>.lock` and returns them as a list.
# Only ids starting with `args.pci_bus` are kept; when `args` has no `pci_bus` attribute the filter is "" and every id passes.
target_dev = args.pci_bus if 'pci_bus' in args.__dir__() else ""
devs = []
# NOTE(review): this span comes from a rendered diff whose +/- markers were lost. The two loops below look like the
# removed and the added version of the same scan, not code meant to run back to back -- confirm against the real file.
# Variant 1: globs the hard-coded /tmp directory. The slice [8:-5] drops the first 8 characters ("/tmp/" + prefix + "_",
# which only lines up for a 2-character prefix) and the trailing ".lock". An id is kept only if it also exists under sysfs.
for dev in glob.glob(f'/tmp/{prefix}_*.lock'):
dev_id = dev[8:-5]
if os.path.exists(f"/sys/bus/pci/devices/{dev_id}") and dev_id.startswith(target_dev): devs.append(dev_id)
# Variant 2: globs the path produced by temp() (imported from tinygrad.helpers). The id is cut from the basename by
# removing `prefix` plus "_" at the front and ".lock" at the end; there is no sysfs existence check here.
for dev in glob.glob(temp(f'{prefix}_*.lock')):
dev_id = dev.split('/')[-1][len(prefix)+1:-5]
if dev_id.startswith(target_dev): devs.append(dev_id)
return devs
def _do_reset_device(pci_bus):
  """Request a reset of one PCI device by writing ``1`` to its sysfs ``reset`` node.

  The write is issued through ``sudo sh -c`` so that the redirection itself runs
  with elevated privileges. The exit status of the command is not inspected and
  nothing is returned.

  Args:
    pci_bus: device address used as the directory name under /sys/bus/pci/devices.
  """
  reset_cmd = f"sudo sh -c 'echo 1 > /sys/bus/pci/devices/{pci_bus}/reset'"
  os.system(reset_cmd)
@ -53,16 +54,7 @@ def cmd_show_pids(args):
for dev in devs:
try:
pid = subprocess.check_output(['sudo', 'lsof', f'/tmp/{prefix}_{dev}.lock']).decode('utf-8').strip().split('\n')[1].split()[1]
print(f"{dev}: {pid}")
except subprocess.CalledProcessError: print(f"{dev}: No processes found using this device")
def cmd_kill_pids(args):
devs = scan_devs_based_on_lock(prefix:={"amd":"am", "nv":"nv"}[args.backend], args)
for dev in devs:
try:
pid = subprocess.check_output(['sudo', 'lsof', f'/tmp/{prefix}_{dev}.lock']).decode('utf-8').strip().split('\n')[1].split()[1]
pid = subprocess.check_output(['sudo', 'lsof', temp(f'{prefix}_{dev}.lock')]).decode('utf-8').strip().split('\n')[1].split()[1]
print(f"{dev}: {pid}")
except subprocess.CalledProcessError: print(f"{dev}: No processes found using this device")
@ -74,7 +66,7 @@ def cmd_kill_pids(args):
if i > 0: time.sleep(0.2)
try:
try: pid = subprocess.check_output(['sudo', 'lsof', f'/tmp/{prefix}_{dev}.lock']).decode('utf-8').strip().split('\n')[1].split()[1]
try: pid = subprocess.check_output(['sudo', 'lsof', temp(f'{prefix}_{dev}.lock')]).decode('utf-8').strip().split('\n')[1].split()[1]
except subprocess.CalledProcessError: break
print(f"Killing process {pid} (which uses {dev})")

View file

@ -131,7 +131,7 @@ class _System:
else: self.lock_fd = os.open(lock_name, os.O_RDWR | os.O_CREAT | os.O_CLOEXEC, 0o666)
try: fcntl.flock(self.lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
except OSError: raise RuntimeError(f"Failed to take lock file {name}. It's already in use.")
except OSError: raise RuntimeError(f"Failed to acquire lock file {name}. `sudo lsof {lock_name}` may help identify the process holding the lock.")
return self.lock_fd