mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
hcq_smi: kill mac pids (#14398)
This commit is contained in:
parent
202b74b369
commit
544928766d
3 changed files with 21 additions and 25 deletions
24
.github/workflows/benchmark.yml
vendored
24
.github/workflows/benchmark.yml
vendored
|
|
@@ -145,6 +145,10 @@ jobs:
|
|||
run: |
|
||||
echo "CACHEDB=/tmp/staging.db" >> $GITHUB_ENV
|
||||
rm -f /tmp/staging.db /tmp/staging.db-shm /tmp/staging.db-wal
|
||||
- name: Kill stale pids
|
||||
run: |
|
||||
PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
|
||||
PYTHONPATH=. ./extra/hcq/hcq_smi.py nv kill_pids
|
||||
- name: UsbGPU boot time
|
||||
run: sudo -E PYTHONPATH=. DEBUG=2 AM_RESET=1 AMD=1 AMD_IFACE=USB time python3.11 test/test_tiny.py TestTiny.test_plus
|
||||
- name: UsbGPU tiny tests
|
||||
|
|
@@ -332,9 +336,9 @@ jobs:
|
|||
- name: Setcap to python
|
||||
run: ./extra/amdpci/setup_python_cap.sh
|
||||
- name: Remove amd modules
|
||||
run: ./extra/hcq/hcq_smi.py amd rmmod
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
|
||||
- name: Kill stale pids
|
||||
run: ./extra/hcq/hcq_smi.py amd kill_pids
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
|
||||
#- name: Insert amdgpu
|
||||
# run: sudo modprobe amdgpu
|
||||
- name: Symlink models and datasets
|
||||
|
|
@@ -444,9 +448,9 @@ jobs:
|
|||
- name: Setcap to python
|
||||
run: ./extra/amdpci/setup_python_cap.sh
|
||||
- name: Remove amd modules
|
||||
run: ./extra/hcq/hcq_smi.py amd rmmod
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
|
||||
- name: Kill stale pids
|
||||
run: ./extra/hcq/hcq_smi.py amd kill_pids
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
|
||||
- name: Symlink models and datasets
|
||||
run: |
|
||||
mkdir -p weights
|
||||
|
|
@@ -496,9 +500,9 @@ jobs:
|
|||
- name: Setcap to python
|
||||
run: ./extra/amdpci/setup_python_cap.sh
|
||||
- name: Remove amd modules
|
||||
run: ./extra/hcq/hcq_smi.py amd rmmod
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
|
||||
- name: Kill stale pids
|
||||
run: ./extra/hcq/hcq_smi.py amd kill_pids
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
|
||||
- name: Symlink models and datasets
|
||||
run: |
|
||||
mkdir -p weights
|
||||
|
|
@@ -587,9 +591,9 @@ jobs:
|
|||
- name: Setcap to python
|
||||
run: ./extra/amdpci/setup_python_cap.sh
|
||||
- name: Remove amd modules
|
||||
run: ./extra/hcq/hcq_smi.py amd rmmod
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd rmmod
|
||||
- name: Kill stale pids
|
||||
run: ./extra/hcq/hcq_smi.py amd kill_pids
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py amd kill_pids
|
||||
- name: Symlink models and datasets
|
||||
run: |
|
||||
mkdir -p weights
|
||||
|
|
@@ -651,9 +655,9 @@ jobs:
|
|||
- name: Setcap to python
|
||||
run: ./extra/amdpci/setup_python_cap.sh
|
||||
- name: Remove nv modules
|
||||
run: ./extra/hcq/hcq_smi.py nv rmmod
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py nv rmmod
|
||||
- name: Kill stale pids
|
||||
run: ./extra/hcq/hcq_smi.py nv kill_pids
|
||||
run: PYTHONPATH=. ./extra/hcq/hcq_smi.py nv kill_pids
|
||||
- name: Symlink models and datasets
|
||||
run: |
|
||||
mkdir -p weights
|
||||
|
|
|
|||
|
|
@@ -1,14 +1,15 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import argparse, glob, os, time, subprocess, sys
|
||||
from tinygrad.helpers import temp
|
||||
|
||||
def scan_devs_based_on_lock(prefix:str, args) -> list[str]:
|
||||
target_dev = args.pci_bus if 'pci_bus' in args.__dir__() else ""
|
||||
|
||||
devs = []
|
||||
for dev in glob.glob(f'/tmp/{prefix}_*.lock'):
|
||||
dev_id = dev[8:-5]
|
||||
if os.path.exists(f"/sys/bus/pci/devices/{dev_id}") and dev_id.startswith(target_dev): devs.append(dev_id)
|
||||
for dev in glob.glob(temp(f'{prefix}_*.lock')):
|
||||
dev_id = dev.split('/')[-1][len(prefix)+1:-5]
|
||||
if dev_id.startswith(target_dev): devs.append(dev_id)
|
||||
return devs
|
||||
|
||||
def _do_reset_device(pci_bus): os.system(f"sudo sh -c 'echo 1 > /sys/bus/pci/devices/{pci_bus}/reset'")
|
||||
|
|
@@ -53,16 +54,7 @@ def cmd_show_pids(args):
|
|||
|
||||
for dev in devs:
|
||||
try:
|
||||
pid = subprocess.check_output(['sudo', 'lsof', f'/tmp/{prefix}_{dev}.lock']).decode('utf-8').strip().split('\n')[1].split()[1]
|
||||
print(f"{dev}: {pid}")
|
||||
except subprocess.CalledProcessError: print(f"{dev}: No processes found using this device")
|
||||
|
||||
def cmd_kill_pids(args):
|
||||
devs = scan_devs_based_on_lock(prefix:={"amd":"am", "nv":"nv"}[args.backend], args)
|
||||
|
||||
for dev in devs:
|
||||
try:
|
||||
pid = subprocess.check_output(['sudo', 'lsof', f'/tmp/{prefix}_{dev}.lock']).decode('utf-8').strip().split('\n')[1].split()[1]
|
||||
pid = subprocess.check_output(['sudo', 'lsof', temp(f'{prefix}_{dev}.lock')]).decode('utf-8').strip().split('\n')[1].split()[1]
|
||||
print(f"{dev}: {pid}")
|
||||
except subprocess.CalledProcessError: print(f"{dev}: No processes found using this device")
|
||||
|
||||
|
|
@@ -74,7 +66,7 @@ def cmd_kill_pids(args):
|
|||
if i > 0: time.sleep(0.2)
|
||||
|
||||
try:
|
||||
try: pid = subprocess.check_output(['sudo', 'lsof', f'/tmp/{prefix}_{dev}.lock']).decode('utf-8').strip().split('\n')[1].split()[1]
|
||||
try: pid = subprocess.check_output(['sudo', 'lsof', temp(f'{prefix}_{dev}.lock')]).decode('utf-8').strip().split('\n')[1].split()[1]
|
||||
except subprocess.CalledProcessError: break
|
||||
|
||||
print(f"Killing process {pid} (which uses {dev})")
|
||||
|
|
|
|||
|
|
@@ -131,7 +131,7 @@ class _System:
|
|||
else: self.lock_fd = os.open(lock_name, os.O_RDWR | os.O_CREAT | os.O_CLOEXEC, 0o666)
|
||||
|
||||
try: fcntl.flock(self.lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
except OSError: raise RuntimeError(f"Failed to take lock file {name}. It's already in use.")
|
||||
except OSError: raise RuntimeError(f"Failed to acquire lock file {name}. `sudo lsof {lock_name}` may help identify the process holding the lock.")
|
||||
|
||||
return self.lock_fd
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue