In-tree autogen: all C libraries (#13220)

* checkout files from autogen branch

* ioctl with payload

* fix am generations

* properly fix generations

This reverts commit b2a54f4f41.

* revert discovery.h

* support pragma pack(1)

* typo

* better getter

* typo

* NVCEC0_QMDV05_00_RELEASE[01]_ENABLE

* align support

* anon handling fix

---------

Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
Christopher Milan 2025-11-13 21:57:44 -05:00 committed by GitHub
commit 09f3aae169
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
67 changed files with 108566 additions and 205710 deletions

View file

@ -19,7 +19,7 @@ on:
jobs:
autogen:
name: Autogen
name: In-tree Autogen
runs-on: ubuntu-24.04
timeout-minutes: 15
steps:
@ -31,66 +31,107 @@ jobs:
opencl: 'true'
amd: 'true'
cuda: 'true'
webgpu: 'true'
llvm: 'true'
pydeps: 'pyyaml mako'
webgpu: 'true'
mesa: 'true'
pydeps: 'clang>=20 pyyaml mako'
- name: Install autogen support packages
run: sudo apt-get install -y --no-install-recommends llvm-14-dev libclang-14-dev llvm-20-dev
run: sudo apt-get install -y --no-install-recommends libclang-20-dev llvm-20-dev hip-dev libusb-1.0-0-dev
- name: Verify OpenCL autogen
run: |
cp tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak
./autogen_stubs.sh opencl
mv tinygrad/runtime/autogen/opencl.py /tmp/opencl.py.bak
python3 -c "from tinygrad.runtime.autogen import opencl"
diff /tmp/opencl.py.bak tinygrad/runtime/autogen/opencl.py
- name: Verify CUDA autogen
run: |
cp tinygrad/runtime/autogen/cuda.py /tmp/cuda.py.bak
cp tinygrad/runtime/autogen/nv_gpu.py /tmp/nv_gpu.py.bak
./autogen_stubs.sh cuda
./autogen_stubs.sh nv
mv tinygrad/runtime/autogen/cuda.py /tmp/cuda.py.bak
mv tinygrad/runtime/autogen/nvrtc.py /tmp/nvrtc.py.bak
mv tinygrad/runtime/autogen/nvjitlink.py /tmp/nvjitlink.py.bak
mv tinygrad/runtime/autogen/nv_gpu.py /tmp/nv_gpu.py.bak
mv tinygrad/runtime/autogen/nv.py /tmp/nv.py.bak
python3 -c "from tinygrad.runtime.autogen import cuda, nvrtc, nvjitlink, nv_gpu, nv"
diff /tmp/cuda.py.bak tinygrad/runtime/autogen/cuda.py
diff /tmp/nvrtc.py.bak tinygrad/runtime/autogen/nvrtc.py
diff /tmp/nvjitlink.py.bak tinygrad/runtime/autogen/nvjitlink.py
diff /tmp/nv_gpu.py.bak tinygrad/runtime/autogen/nv_gpu.py
diff /tmp/nv.py.bak tinygrad/runtime/autogen/nv.py
- name: Verify AMD autogen
run: |
cp tinygrad/runtime/autogen/hsa.py /tmp/hsa.py.bak
cp tinygrad/runtime/autogen/kfd.py /tmp/kfd.py.bak
cp tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
cp tinygrad/runtime/autogen/amd_gpu.py /tmp/amd_gpu.py.bak
cp tinygrad/runtime/autogen/sqtt.py /tmp/sqtt.py.bak
./autogen_stubs.sh hsa
./autogen_stubs.sh kfd
./autogen_stubs.sh comgr
./autogen_stubs.sh amd
./autogen_stubs.sh sqtt
diff /tmp/hsa.py.bak tinygrad/runtime/autogen/hsa.py
diff /tmp/kfd.py.bak tinygrad/runtime/autogen/kfd.py
mv tinygrad/runtime/autogen/comgr.py /tmp/comgr.py.bak
mv tinygrad/runtime/autogen/hsa.py /tmp/hsa.py.bak
mv tinygrad/runtime/autogen/hip.py /tmp/hip.py.bak
mv tinygrad/runtime/autogen/amd_gpu.py /tmp/amd_gpu.py.bak
mv tinygrad/runtime/autogen/sqtt.py /tmp/sqtt.py.bak
mv tinygrad/runtime/autogen/rocprof.py /tmp/rocprof.py.bak
mv tinygrad/runtime/autogen/am/am.py /tmp/am_am.py.bak
mv tinygrad/runtime/autogen/am/pm4_soc15.py /tmp/am_pm4_soc15.py.bak
mv tinygrad/runtime/autogen/am/pm4_nv.py /tmp/am_pm4_nv.py.bak
mv tinygrad/runtime/autogen/am/sdma_4_0_0.py /tmp/am_sdma_4_0_0.py.bak
mv tinygrad/runtime/autogen/am/sdma_5_0_0.py /tmp/am_sdma_5_0_0.py.bak
mv tinygrad/runtime/autogen/am/sdma_6_0_0.py /tmp/am_sdma_6_0_0.py.bak
mv tinygrad/runtime/autogen/am/smu_v13_0_0.py /tmp/am_smu_v13_0_0.py.bak
mv tinygrad/runtime/autogen/am/smu_v14_0_2.py /tmp/am_smu_v14_0_2.py.bak
python3 -c "from tinygrad.runtime.autogen import comgr, hsa, hip, amd_gpu, sqtt, rocprof; from tinygrad.runtime.autogen.am import am, pm4_soc15, pm4_nv, sdma_4_0_0, sdma_5_0_0, sdma_6_0_0, smu_v13_0_0, smu_v14_0_2"
diff /tmp/comgr.py.bak tinygrad/runtime/autogen/comgr.py
diff /tmp/hsa.py.bak tinygrad/runtime/autogen/hsa.py
diff /tmp/hip.py.bak tinygrad/runtime/autogen/hip.py
diff /tmp/amd_gpu.py.bak tinygrad/runtime/autogen/amd_gpu.py
diff /tmp/sqtt.py.bak tinygrad/runtime/autogen/sqtt.py
diff /tmp/rocprof.py.bak tinygrad/runtime/autogen/rocprof.py
diff /tmp/am_am.py.bak tinygrad/runtime/autogen/am/am.py
diff /tmp/am_pm4_soc15.py.bak tinygrad/runtime/autogen/am/pm4_soc15.py
diff /tmp/am_pm4_nv.py.bak tinygrad/runtime/autogen/am/pm4_nv.py
diff /tmp/am_sdma_4_0_0.py.bak tinygrad/runtime/autogen/am/sdma_4_0_0.py
diff /tmp/am_sdma_5_0_0.py.bak tinygrad/runtime/autogen/am/sdma_5_0_0.py
diff /tmp/am_sdma_6_0_0.py.bak tinygrad/runtime/autogen/am/sdma_6_0_0.py
diff /tmp/am_smu_v13_0_0.py.bak tinygrad/runtime/autogen/am/smu_v13_0_0.py
diff /tmp/am_smu_v14_0_2.py.bak tinygrad/runtime/autogen/am/smu_v14_0_2.py
- name: Verify Linux autogen
run: |
cp tinygrad/runtime/autogen/io_uring.py /tmp/io_uring.py.bak
cp tinygrad/runtime/autogen/ib.py /tmp/ib.py.bak
./autogen_stubs.sh io_uring
./autogen_stubs.sh ib
mv tinygrad/runtime/autogen/libc.py /tmp/libc.py.bak
mv tinygrad/runtime/autogen/kfd.py /tmp/kfd.py.bak
mv tinygrad/runtime/autogen/io_uring.py /tmp/io_uring.py.bak
mv tinygrad/runtime/autogen/ib.py /tmp/ib.py.bak
mv tinygrad/runtime/autogen/pci.py /tmp/pci.py.bak
mv tinygrad/runtime/autogen/vfio.py /tmp/vfio.py.bak
python3 -c "from tinygrad.runtime.autogen import libc, kfd, io_uring, ib, pci, vfio"
diff /tmp/libc.py.bak tinygrad/runtime/autogen/libc.py
diff /tmp/kfd.py.bak tinygrad/runtime/autogen/kfd.py
diff /tmp/io_uring.py.bak tinygrad/runtime/autogen/io_uring.py
diff /tmp/ib.py.bak tinygrad/runtime/autogen/ib.py
- name: Verify WebGPU autogen
run: |
cp tinygrad/runtime/autogen/webgpu.py /tmp/webgpu.py.bak
./autogen_stubs.sh webgpu
diff /tmp/webgpu.py.bak tinygrad/runtime/autogen/webgpu.py
diff /tmp/pci.py.bak tinygrad/runtime/autogen/pci.py
diff /tmp/vfio.py.bak tinygrad/runtime/autogen/vfio.py
- name: Verify LLVM autogen
run: |
cp tinygrad/runtime/autogen/llvm.py /tmp/llvm.py.bak
./autogen_stubs.sh llvm
mv tinygrad/runtime/autogen/llvm.py /tmp/llvm.py.bak
python3 -c "from tinygrad.runtime.autogen import llvm"
diff /tmp/llvm.py.bak tinygrad/runtime/autogen/llvm.py
- name: Verify WebGPU autogen
run: |
mv tinygrad/runtime/autogen/webgpu.py /tmp/webgpu.py.bak
python3 -c "from tinygrad.runtime.autogen import webgpu"
diff /tmp/webgpu.py.bak tinygrad/runtime/autogen/webgpu.py
- name: Verify Qualcomm autogen
run: |
mv tinygrad/runtime/autogen/kgsl.py /tmp/kgsl.py.bak
mv tinygrad/runtime/autogen/adreno.py /tmp/adreno.py.bak
mv tinygrad/runtime/autogen/qcom_dsp.py /tmp/qcom_dsp.py.bak
python3 -c "from tinygrad.runtime.autogen import kgsl, adreno, qcom_dsp"
diff /tmp/kgsl.py.bak tinygrad/runtime/autogen/kgsl.py
diff /tmp/adreno.py.bak tinygrad/runtime/autogen/adreno.py
diff /tmp/qcom_dsp.py.bak tinygrad/runtime/autogen/qcom_dsp.py
- name: Verify libusb autogen
run: |
mv tinygrad/runtime/autogen/libusb.py /tmp/libusb.py.bak
python3 -c "from tinygrad.runtime.autogen import libusb"
diff /tmp/libusb.py.bak tinygrad/runtime/autogen/libusb.py
- name: Verify mesa autogen
run: |
cp tinygrad/runtime/autogen/mesa.py /tmp/mesa.py.bak
./autogen_stubs.sh mesa
mv tinygrad/runtime/autogen/mesa.py /tmp/mesa.py.bak
python3 -c "from tinygrad.runtime.autogen import mesa"
diff /tmp/mesa.py.bak tinygrad/runtime/autogen/mesa.py
autogen-ng:
name: In-tree Autogen
autogen-comgr-3:
name: In-tree Autogen (comgr 3)
runs-on: ubuntu-24.04
timeout-minutes: 15
steps:
@ -101,9 +142,16 @@ jobs:
with:
pydeps: 'clang>=20'
- name: Install autogen support packages
run: sudo apt-get install -y --no-install-recommends libclang-20-dev
- name: Verify Linux autogen
run: |
mv tinygrad/runtime/autogen/libc.py /tmp/libc.py.bak
python3 -c "from tinygrad.runtime.autogen import libc"
diff /tmp/libc.py.bak tinygrad/runtime/autogen/libc.py
wget https://repo.radeon.com/rocm/rocm.gpg.key -O - | gpg --dearmor | sudo tee /etc/apt/keyrings/rocm.gpg > /dev/null
sudo tee /etc/apt/sources.list.d/rocm.list <<EOF
deb [arch=amd64 signed-by=/etc/apt/keyrings/rocm.gpg] https://repo.radeon.com/rocm/apt/6.4 $(lsb_release -cs) main
EOF
echo -e 'Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' | sudo tee /etc/apt/preferences.d/rocm-pin-600
sudo apt -qq update || true
sudo apt-get install -y --no-install-recommends libclang-20-dev comgr
- name: Verify comgr (3) autogen
run: |
mv tinygrad/runtime/autogen/comgr_3.py /tmp/comgr_3.py.bak
python3 -c "from tinygrad.runtime.autogen import comgr"
diff /tmp/comgr_3.py.bak tinygrad/runtime/autogen/comgr_3.py

View file

@ -230,7 +230,7 @@ jobs:
python-version: '3.11'
deps: linting
- name: Lint bad-indentation and trailing-whitespace with pylint
run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string=' ' --recursive=y . --ignore-paths='tinygrad/runtime/autogen'
run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string=' ' --recursive=y .
- name: Lint with ruff
run: |
pip3 install --upgrade --force-reinstall ruff==0.11.0

View file

@ -1,550 +0,0 @@
#!/bin/bash -e
# setup instructions for clang2py
# Bootstrap: when `clang2py -V` prints nothing (tool missing or failing), install clang,
# the clang==14.0.6 Python bindings and the nimlgen/ctypeslib fork, then re-run the version check.
if [[ ! $(clang2py -V) ]]; then
# remember the caller's directory so the popd below can restore it
pushd .
cd /tmp
sudo apt-get install -y --no-install-recommends clang
pip install --upgrade pip setuptools
pip install clang==14.0.6
git clone https://github.com/nimlgen/ctypeslib.git
cd ctypeslib
pip install .
# under `bash -e` a failing version check here aborts the script
clang2py -V
popd
fi
# output directory for every generated module (note the trailing slash: paths below become $BASE/<name>.py)
BASE=tinygrad/runtime/autogen/
fixup() {
  # Post-process one generated module in place.
  #   $1  path of the generated .py file
  local generated=$1
  # single sed pass: prepend the mypy opt-out header on line 1, then strip trailing spaces from each line
  sed -i -e '1s/^/# mypy: ignore-errors\n/' -e 's/ *$//' $generated
  # print any stubs clang2py left behind; `|| true` keeps a clean file (grep exit 1) from tripping `bash -e`
  grep FIXME_STUB $generated || true
}
patch_dlopen() {
  # Inject a library-loading helper into a generated module.
  #   $1     path of the generated .py file
  #   $2     library name: passed to ctypes.util.find_library and used as the suffix of the
  #          emitted helper's name (_try_dlopen_<name>)
  #   $3...  Python expressions, already quoted for Python, listing fallback paths to try
  # The snippet is read from stdin by sed's `r` command and placed after a line matching
  # "import ctypes.*". The emitted helper returns a ctypes.CDLL, or None when nothing loads.
  path=$1; shift
  name=$1; shift
  # The heredoc body is emitted verbatim, so it stays at column 0 and carries its own Python
  # indentation; without that indentation the injected def/if/for bodies are a syntax error.
  cat <<EOF | sed -i "/import ctypes.*/r /dev/stdin" $path
PATHS_TO_TRY = [
$(for p in "$@"; do echo "  $p,"; done)
]
def _try_dlopen_$name():
  library = ctypes.util.find_library("$name")
  if library:
    try: return ctypes.CDLL(library)
    except OSError: pass
  for candidate in PATHS_TO_TRY:
    try: return ctypes.CDLL(candidate)
    except OSError: pass
  return None
EOF
}
generate_opencl() {
# Generate $BASE/opencl.py from the system CL/cl.h header, bound to libOpenCL.so.1.
clang2py /usr/include/CL/cl.h -o $BASE/opencl.py -l /usr/lib/x86_64-linux-gnu/libOpenCL.so.1 -k cdefstum
fixup $BASE/opencl.py
# hot patches
# (the sed commands use `\` as the s-command delimiter because the patterns contain `/`)
# replace the hard-coded library path with a ctypes.util.find_library('OpenCL') lookup
sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/opencl.py
sed -i "s\ctypes.CDLL('/usr/lib/x86_64-linux-gnu/libOpenCL.so.1')\ctypes.CDLL(ctypes.util.find_library('OpenCL'))\g" $BASE/opencl.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.opencl"
}
generate_hip() {
# Generate $BASE/hip.py from the ROCm HIP headers (parsed as C++), bound to libamdhip64.so.
clang2py /opt/rocm/include/hip/hip_ext.h /opt/rocm/include/hip/hiprtc.h \
/opt/rocm/include/hip/hip_runtime_api.h /opt/rocm/include/hip/driver_types.h \
--clang-args="-D__HIP_PLATFORM_AMD__ -I/opt/rocm/include -x c++" -o $BASE/hip.py -l /opt/rocm/lib/libamdhip64.so
# append aliases so the unsuffixed names refer to the R0600-suffixed definitions
echo "hipDeviceProp_t = hipDeviceProp_tR0600" >> $BASE/hip.py
echo "hipGetDeviceProperties = hipGetDevicePropertiesR0600" >> $BASE/hip.py
fixup $BASE/hip.py
# we can trust HIP is always at /opt/rocm/lib
#sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/hip.py
#sed -i "s\ctypes.CDLL('/opt/rocm/lib/libhiprtc.so')\ctypes.CDLL(ctypes.util.find_library('hiprtc'))\g" $BASE/hip.py
#sed -i "s\ctypes.CDLL('/opt/rocm/lib/libamdhip64.so')\ctypes.CDLL(ctypes.util.find_library('amdhip64'))\g" $BASE/hip.py
# make the '/opt/rocm/ prefix overridable at import time through the ROCM_PATH environment variable
sed -i "s\import ctypes\import ctypes, os\g" $BASE/hip.py
sed -i "s\'/opt/rocm/\os.getenv('ROCM_PATH', '/opt/rocm/')+'/\g" $BASE/hip.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.hip"
}
generate_comgr() {
# Generate $BASE/comgr.py from amd_comgr.h (parsed as C++), bound to libamd_comgr.so.
clang2py /opt/rocm/include/amd_comgr/amd_comgr.h \
--clang-args="-D__HIP_PLATFORM_AMD__ -I/opt/rocm/include -x c++" -o $BASE/comgr.py -l /opt/rocm/lib/libamd_comgr.so
fixup $BASE/comgr.py
sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/comgr.py
# inject _try_dlopen_amd_comgr() with fallback paths: /opt/rocm, $ROCM_PATH, and two .dylib locations
patch_dlopen $BASE/comgr.py amd_comgr "'/opt/rocm/lib/libamd_comgr.so'" "os.getenv('ROCM_PATH', '')+'/lib/libamd_comgr.so'" "'/usr/local/lib/libamd_comgr.dylib'" "'/opt/homebrew/lib/libamd_comgr.dylib'"
# load the library through the injected helper instead of the hard-coded path
sed -i "s\ctypes.CDLL('/opt/rocm/lib/libamd_comgr.so')\_try_dlopen_amd_comgr()\g" $BASE/comgr.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.comgr"
}
generate_kfd() {
# Generate $BASE/kfd.py from linux/kfd_ioctl.h and route its ioctls through FileIOInterface.
clang2py /usr/include/linux/kfd_ioctl.h -o $BASE/kfd.py -k cdefstum
fixup $BASE/kfd.py
sed -i "s/import ctypes/import ctypes, os/g" $BASE/kfd.py
# drop the fcntl import and add the FileIOInterface import on the line after "import functools"
sed -i "s/import fcntl, functools/import functools/g" $BASE/kfd.py
sed -i "/import functools/a from tinygrad.runtime.support.hcq import FileIOInterface" $BASE/kfd.py
# annotate __fd as FileIOInterface and call its .ioctl() method instead of fcntl.ioctl(__fd, ...)
sed -i "s/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd, \*\*kwargs):/def _do_ioctl(__idir, __base, __nr, __user_struct, __fd:FileIOInterface, \*\*kwargs):/g" $BASE/kfd.py
sed -i "s/fcntl.ioctl(__fd, (__idir<<30)/__fd.ioctl((__idir<<30)/g" $BASE/kfd.py
# rewrite the C double-negation `!!` as Python `not not `
sed -i "s/!!/not not /g" $BASE/kfd.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.kfd"
}
generate_cuda() {
# Generate $BASE/cuda.py from /usr/include/cuda.h, bound to libcuda.so.
clang2py /usr/include/cuda.h --clang-args="-D__CUDA_API_VERSION_INTERNAL" -o $BASE/cuda.py -l /usr/lib/x86_64-linux-gnu/libcuda.so
# replace the hard-coded library path with a ctypes.util.find_library('cuda') lookup
sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/cuda.py
sed -i "s\ctypes.CDLL('/usr/lib/x86_64-linux-gnu/libcuda.so')\ctypes.CDLL(ctypes.util.find_library('cuda'))\g" $BASE/cuda.py
fixup $BASE/cuda.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.cuda"
}
generate_nvrtc() {
# Generate $BASE/nvrtc.py from nvrtc.h and nvJitLink.h, bound to libnvrtc.so and libnvJitLink.so.
clang2py /usr/local/cuda/include/nvrtc.h /usr/local/cuda/include/nvJitLink.h -o $BASE/nvrtc.py -l /usr/local/cuda/lib64/libnvrtc.so -l /usr/local/cuda/lib64/libnvJitLink.so
# replace both hard-coded library paths with ctypes.util.find_library lookups
sed -i "s\import ctypes\import ctypes, ctypes.util\g" $BASE/nvrtc.py
sed -i "s\ctypes.CDLL('/usr/local/cuda/lib64/libnvrtc.so')\ctypes.CDLL(ctypes.util.find_library('nvrtc'))\g" $BASE/nvrtc.py
sed -i "s\ctypes.CDLL('/usr/local/cuda/lib64/libnvJitLink.so')\ctypes.CDLL(ctypes.util.find_library('nvJitLink'))\g" $BASE/nvrtc.py
fixup $BASE/nvrtc.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.nvrtc"
}
generate_nv() {
# Generate $BASE/nv_gpu.py and $BASE/nv/nv.py from NVIDIA's open-gpu-kernel-modules sources
# (pinned to NVKERN_COMMIT_HASH) plus local headers under extra/nv_gpu_driver.
NVKERN_COMMIT_HASH=81fe4fb417c8ac3b9bdcc1d56827d116743892a5
NVKERN_SRC=/tmp/open-gpu-kernel-modules-$NVKERN_COMMIT_HASH
# clone once per commit hash; an existing directory is reused without being re-checked
if [ ! -d "$NVKERN_SRC" ]; then
git clone https://github.com/NVIDIA/open-gpu-kernel-modules $NVKERN_SRC
pushd .
cd $NVKERN_SRC
git reset --hard $NVKERN_COMMIT_HASH
popd
fi
# first pass: class, ctrl, UVM and ioctl headers -> nv_gpu.py (nvtypes.h is force-included)
clang2py -k cdefstum \
extra/nv_gpu_driver/clc6c0qmd.h \
extra/nv_gpu_driver/clcec0qmd.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl0000.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl0080.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl2080.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl2080_notification.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc56f.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc86f.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc96f.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc761.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/cl83de.h \
$NVKERN_SRC/src/nvidia/generated/g_allclasses.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clc6c0.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/class/clcdc0.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/clc6b5.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/clc9b5.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/uvm_ioctl.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/uvm_linux_ioctl.h \
$NVKERN_SRC/kernel-open/nvidia-uvm/hwref/ampere/ga100/dev_fault.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv_escape.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl-numbers.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-ioctl-numa.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include/nv-unix-nvos-params-wrappers.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/alloc/alloc_channel.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/nvos.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrl0000/*.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrl0080/*.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrl2080/*.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrl83de/*.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrlc36f.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrlcb33.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrla06c.h \
$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl/ctrl90f1.h \
--clang-args="-include $NVKERN_SRC/src/common/sdk/nvidia/inc/nvtypes.h -I$NVKERN_SRC/src/common/inc -I$NVKERN_SRC/kernel-open/nvidia-uvm -I$NVKERN_SRC/kernel-open/common/inc -I$NVKERN_SRC/src/common/sdk/nvidia/inc -I$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include -I$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl" \
-o $BASE/nv_gpu.py
fixup $BASE/nv_gpu.py
# replace the literal "(0000000001)" with "1"
sed -i "s\(0000000001)\1\g" $BASE/nv_gpu.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/nv_gpu.py
# turn (optionally commented-out) `NAME = MW ( hi : lo )` lines into `NAME = (hi , lo)` tuples
sed -i 's/#\?\s\([A-Za-z0-9_]\+\) = MW ( \([0-9]\+\) : \([0-9]\+\) )/\1 = (\2 , \3)/' $BASE/nv_gpu.py # NVC6C0_QMDV03_00 processing
# uncomment the `# def NVC6C0_QMD...:` and `# def NVCEC0_QMD...:` function headers
sed -i 's/#\sdef NVC6C0_QMD\([A-Za-z0-9_()]\+\):/def NVC6C0_QMD\1:/' $BASE/nv_gpu.py
sed -i 's/#\sdef NVCEC0_QMD\([A-Za-z0-9_()]\+\):/def NVCEC0_QMD\1:/' $BASE/nv_gpu.py
# for `def NVCEC0_QMDV05_00_RELEASE_ENABLE(i):`, collect the aliases
# NVCEC0_QMDV05_00_RELEASE0_ENABLE / NVCEC0_QMDV05_00_RELEASE1_ENABLE (= the function called with 0 / 1)
# in the hold space and print them at the end of the file
sed -E -i -n '/^def (NVCEC0_QMDV05_00_RELEASE)(_ENABLE)\(i\):/{p;s//\1'"0"'\2=\1\2(0)\n\1'"1"'\2=\1\2(1)/;H;b};p;${x;s/^\n//;p}' "$BASE/nv_gpu.py"
# uncomment `# return MW(a:b)` bodies as ` return (a , b)`
sed -i 's/#\s*return MW(\([0-9i()*+]\+\):\([0-9i()*+]\+\))/ return (\1 , \2)/' $BASE/nv_gpu.py
sed -i 's/#\?\s*\(.*\)\s*=\s*\(NV\)\?BIT\(32\)\?\s*(\s*\([0-9]\+\)\s*)/\1 = (1 << \4)/' $BASE/nv_gpu.py # name = BIT(x) -> name = (1 << x)
sed -i "s/UVM_\([A-Za-z0-9_]\+\) = \['i', '(', '\([0-9]\+\)', ')'\]/UVM_\1 = \2/" $BASE/nv_gpu.py # UVM_name = ['i', '(', '<num>', ')'] -> UVM_name = <num>
# Parse status codes
# emits `nv_status_codes = {}` and then, for every NV_STATUS_CODE(name, value, "text") line in
# nvstatuscodes.h, `name = value` plus `nv_status_codes[name] = "text"`; the output is appended to nv_gpu.py
sed -n '1i\
nv_status_codes = {}
/^NV_STATUS_CODE/ { s/^NV_STATUS_CODE(\([^,]*\), *\([^,]*\), *"\([^"]*\)") *.*$/\1 = \2\nnv_status_codes[\1] = "\3"/; p }' $NVKERN_SRC/src/common/sdk/nvidia/inc/nvstatuscodes.h >> $BASE/nv_gpu.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.nv_gpu"
# second pass: GSP/FSP/RPC headers -> nv/nv.py
# NOTE(review): gsp_init_args.h appears twice in the header list below — looks redundant, confirm
clang2py -k cdefstum \
$NVKERN_SRC/src/nvidia/inc/kernel/gpu/fsp/kern_fsp_cot_payload.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/common/inc/gsp/gspifpub.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/common/inc/gsp/gsp_fw_wpr_meta.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/common/inc/gsp/gsp_fw_sr_meta.h \
$NVKERN_SRC/src/nvidia/inc/kernel/gpu/gsp/gsp_init_args.h \
$NVKERN_SRC/src/nvidia/inc/kernel/gpu/gsp/gsp_init_args.h \
$NVKERN_SRC/src/common/uproc/os/common/include/libos_init_args.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/common/inc/rmRiscvUcode.h \
$NVKERN_SRC/src/common/shared/msgq/inc/msgq/msgq_priv.h \
$NVKERN_SRC/src/nvidia/inc/kernel/vgpu/rpc_headers.h \
$NVKERN_SRC/src/nvidia/inc/kernel/vgpu/rpc_global_enums.h \
$NVKERN_SRC/src/nvidia/generated/g_rpc-structures.h \
$NVKERN_SRC/src/nvidia/arch/nvalloc/common/inc/fsp/fsp_nvdm_format.h \
extra/nv_gpu_driver/g_rpc-message-header.h \
extra/nv_gpu_driver/gsp_static_config.h \
extra/nv_gpu_driver/vbios.h \
extra/nv_gpu_driver/pci_exp_table.h \
--clang-args="-DRPC_MESSAGE_STRUCTURES -DRPC_STRUCTURES -include $NVKERN_SRC/src/common/sdk/nvidia/inc/nvtypes.h -I$NVKERN_SRC/src/nvidia/generated -I$NVKERN_SRC/src/common/inc -I$NVKERN_SRC/src/nvidia/inc -I$NVKERN_SRC/src/nvidia/interface/ -I$NVKERN_SRC/src/nvidia/inc/kernel -I$NVKERN_SRC/src/nvidia/inc/libraries -I$NVKERN_SRC/src/nvidia/arch/nvalloc/common/inc -I$NVKERN_SRC/kernel-open/nvidia-uvm -I$NVKERN_SRC/kernel-open/common/inc -I$NVKERN_SRC/src/common/sdk/nvidia/inc -I$NVKERN_SRC/src/nvidia/arch/nvalloc/unix/include -I$NVKERN_SRC/src/common/sdk/nvidia/inc/ctrl" \
-o $BASE/nv/nv.py
fixup $BASE/nv/nv.py
# smoke test: the second module must import
python3 -c "import tinygrad.runtime.autogen.nv.nv"
}
generate_amd() {
# Generate $BASE/amd_gpu.py from the register/offset headers under extra/hip_gpu_driver.
# clang2py broken when pass -x c++ to prev headers
clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
extra/hip_gpu_driver/nvd.h \
extra/hip_gpu_driver/gc_11_0_0_offset.h \
extra/hip_gpu_driver/sienna_cichlid_ip_offset.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/amd_gpu.py
fixup $BASE/amd_gpu.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/amd_gpu.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.amd_gpu"
}
generate_hsa() {
# Generate $BASE/hsa.py from the ROCm HSA headers, bound to libhsa-runtime64.so.
clang2py \
/opt/rocm/include/hsa/hsa.h \
/opt/rocm/include/hsa/hsa_ext_amd.h \
/opt/rocm/include/hsa/amd_hsa_signal.h \
/opt/rocm/include/hsa/amd_hsa_queue.h \
/opt/rocm/include/hsa/amd_hsa_kernel_code.h \
/opt/rocm/include/hsa/hsa_ext_finalize.h /opt/rocm/include/hsa/hsa_ext_image.h \
/opt/rocm/include/hsa/hsa_ven_amd_aqlprofile.h \
--clang-args="-I/opt/rocm/include" \
-o $BASE/hsa.py -l /opt/rocm/lib/libhsa-runtime64.so
fixup $BASE/hsa.py
sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/hsa.py
# load from $ROCM_PATH/lib when ROCM_PATH is set, otherwise fall back to find_library('hsa-runtime64')
sed -i "s\ctypes.CDLL('/opt/rocm/lib/libhsa-runtime64.so')\ctypes.CDLL(os.getenv('ROCM_PATH')+'/lib/libhsa-runtime64.so' if os.getenv('ROCM_PATH') else ctypes.util.find_library('hsa-runtime64'))\g" $BASE/hsa.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.hsa"
}
generate_io_uring() {
# Generate $BASE/io_uring.py from liburing.h and linux/io_uring.h, then append syscall numbers.
clang2py -k cdefstum \
/usr/include/liburing.h \
/usr/include/linux/io_uring.h \
-o $BASE/io_uring.py
# rewrite each `#define __NR_io_uring_X <n>` line as `NR_io_uring_X = <n>`; `t` keeps rewritten
# lines and `d` drops every other line of unistd.h
sed -r '/^#define __NR_io_uring/ s/^#define __(NR_io_uring[^ ]+) (.*)$/\1 = \2/; t; d' /usr/include/asm-generic/unistd.h >> $BASE/io_uring.py # io_uring syscalls numbers
fixup $BASE/io_uring.py
}
generate_ib() {
# Generate $BASE/ib.py from the infiniband/rdma verbs headers.
clang2py -k cdefstum \
/usr/include/infiniband/verbs.h \
/usr/include/infiniband/verbs_api.h \
/usr/include/infiniband/ib_user_ioctl_verbs.h \
/usr/include/rdma/ib_user_verbs.h \
-o $BASE/ib.py
sed -i "s\import ctypes\import ctypes, ctypes.util\g" "$BASE/ib.py"
# no -l was given to clang2py: rename the FIXME_STUB placeholder to libibverbs and replace the
# stub factory with a real CDLL located via find_library('ibverbs'), opened with use_errno=True
sed -i "s\FIXME_STUB\libibverbs\g" "$BASE/ib.py"
sed -i "s\FunctionFactoryStub()\ctypes.CDLL(ctypes.util.find_library('ibverbs'), use_errno=True)\g" "$BASE/ib.py"
fixup $BASE/ib.py
}
generate_llvm() {
# Generate $BASE/llvm.py from the llvm-c headers and Config .def files reported by llvm-config-14.
INC="$(llvm-config-14 --includedir)"
# headers are passed in sorted order so the input list is stable between runs
clang2py -k cdefstum \
$(find "$INC/llvm-c/" -type f -name '*.h' | sort) \
"$INC/llvm/Config/Targets.def" \
"$INC/llvm/Config/AsmPrinters.def" \
"$INC/llvm/Config/AsmParsers.def" \
"$INC/llvm/Config/Disassemblers.def" \
--clang-args="$(llvm-config-14 --cflags)" \
-o "$BASE/llvm.py"
# rename the FIXME_STUB placeholder to llvm and load the library from llvm_support.LLVM_PATH
sed -i "s\import ctypes\import ctypes, tinygrad.runtime.support.llvm as llvm_support\g" "$BASE/llvm.py"
sed -i "s\FIXME_STUB\llvm\g" "$BASE/llvm.py"
sed -i "s\FunctionFactoryStub()\ctypes.CDLL(llvm_support.LLVM_PATH)\g" "$BASE/llvm.py"
fixup "$BASE/llvm.py"
}
generate_kgsl() {
# Generate $BASE/kgsl.py from extra/qcom_gpu_driver/msm_kgsl.h.
clang2py extra/qcom_gpu_driver/msm_kgsl.h -o $BASE/kgsl.py -k cdefstum
fixup $BASE/kgsl.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/kgsl.py
# for each `#define X_SHIFT <n>` in the header, append a helper
# `def X(val): return (val << X_SHIFT) & X_MASK`
sed -nE 's/#define ([A-Za-z0-9_]+)_SHIFT\s*[^\S\r\n]*[0-9]*$/def \1(val): return (val << \1_SHIFT) \& \1_MASK/p' extra/qcom_gpu_driver/msm_kgsl.h >> $BASE/kgsl.py
# call the .ioctl() method of the passed file object instead of fcntl.ioctl(__fd, ...)
sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\__fd.ioctl((__idir<<30)\g" $BASE/kgsl.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.kgsl"
}
generate_adreno() {
# Generate $BASE/adreno.py from extra/qcom_gpu_driver/a6xx.xml.h (symbol kinds: -k cestum).
clang2py extra/qcom_gpu_driver/a6xx.xml.h -o $BASE/adreno.py -k cestum
# for each `#define X__SHIFT <n>` in the header, append a helper
# `def X(val): return (val << X__SHIFT) & X__MASK`
sed -nE 's/#define ([A-Za-z0-9_]+)__SHIFT\s*[^\S\r\n]*[0-9]*$/def \1(val): return (val << \1__SHIFT) \& \1__MASK/p' extra/qcom_gpu_driver/a6xx.xml.h >> $BASE/adreno.py
fixup $BASE/adreno.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/adreno.py
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.adreno"
}
generate_qcom() {
# Generate $BASE/qcom_dsp.py from the ion / adsprpc / remote headers under extra/dsp/include.
clang2py -k cdefstum \
extra/dsp/include/ion.h \
extra/dsp/include/msm_ion.h \
extra/dsp/include/adsprpc_shared.h \
extra/dsp/include/remote_default.h \
extra/dsp/include/apps_std.h \
-o $BASE/qcom_dsp.py
fixup $BASE/qcom_dsp.py
# smoke test: the generated module must import
python3 -c "import tinygrad.runtime.autogen.qcom_dsp"
}
generate_pci() {
# Generate $BASE/pci.py from linux/pci_regs.h; only the common fixup is applied afterwards.
clang2py -k cdefstum \
/usr/include/linux/pci_regs.h \
-o $BASE/pci.py
fixup $BASE/pci.py
}
generate_vfio() {
  # Generate $BASE/vfio.py from linux/vfio.h and route its ioctls through FileIOInterface,
  # using the same import rewrites as generate_kfd.
  clang2py -k cdefstum \
    /usr/include/linux/vfio.h \
    -o $BASE/vfio.py
  fixup $BASE/vfio.py
  sed -i "s\import ctypes\import ctypes, os\g" $BASE/vfio.py
  # drop the fcntl import; the s command needs its closing delimiter, otherwise sed rejects it
  # as unterminated and `bash -e` aborts the script
  sed -i "s\import fcntl, functools\import functools\g" $BASE/vfio.py
  # append the FileIOInterface import on the line after "import functools" (sed `a` command),
  # from the same module generate_kfd imports it from
  sed -i "/import functools/a from tinygrad.runtime.support.hcq import FileIOInterface" $BASE/vfio.py
  # call the .ioctl() method of the passed file object instead of fcntl.ioctl(__fd, ...)
  # NOTE(review): this replacement also inserts `return`; confirm that matches what clang2py
  # emits in front of fcntl.ioctl for this header
  sed -i "s\fcntl.ioctl(__fd, (__idir<<30)\return __fd.ioctl((__idir<<30)\g" $BASE/vfio.py
}
generate_am() {
# Generate the modules under $BASE/am/ (am, pm4_soc15, pm4_nv, sdma_4_0_0, sdma_5_0_0, sdma_6_0_0,
# smu_v13_0_0, smu_v14_0_2) from ROCK-Kernel-Driver sources and local headers under extra/.
AMKERN_COMMIT_HASH=ceb12c04e2b5b53ec0779362831f5ee40c4921e4
AMKERN_SRC=/tmp/ROCK-Kernel-Driver-$AMKERN_COMMIT_HASH
# NOTE(review): the hash only names the directory; the shallow clone below is never reset to it
# (generate_nv does `git reset --hard` on its hash) — confirm whether pinning was intended
if [ ! -d "$AMKERN_SRC" ]; then
git clone https://github.com/ROCm/ROCK-Kernel-Driver $AMKERN_SRC --depth 1
fi
AMKERN_AMD=$AMKERN_SRC/drivers/gpu/drm/amd/
AMKERN_INC=$AMKERN_AMD/include/
# am.py: local amdpci headers plus soc15_ih_clientid.h, with stdint.h force-included
clang2py -k cdefstum \
extra/amdpci/headers/v11_structs.h \
extra/amdpci/headers/v12_structs.h \
extra/amdpci/headers/amdgpu_vm.h \
extra/amdpci/headers/discovery.h \
extra/amdpci/headers/amdgpu_ucode.h \
extra/amdpci/headers/psp_gfx_if.h \
extra/amdpci/headers/amdgpu_psp.h \
extra/amdpci/headers/amdgpu_irq.h \
extra/amdpci/headers/amdgpu_doorbell.h \
$AMKERN_INC/soc15_ih_clientid.h \
--clang-args="-include stdint.h" \
-o $BASE/am/am.py
fixup $BASE/am/am.py
# replace the leftover C cast "(int64_t)" with a space
sed -i "s\(int64_t)\ \g" $BASE/am/am.py
sed -i "s\AMDGPU_PTE_MTYPE_VG10(2)\AMDGPU_PTE_MTYPE_VG10(0, 2)\g" $BASE/am/am.py # incorrect parsing (TODO: remove when clang2py is gone).
# pm4_soc15.py: PM4 packet headers plus soc15d.h
clang2py -k cdefstum \
$AMKERN_AMD/amdkfd/kfd_pm4_headers_ai.h \
$AMKERN_AMD/amdgpu/soc15d.h \
-o $BASE/am/pm4_soc15.py
fixup $BASE/am/pm4_soc15.py
# pm4_nv.py: PM4 packet headers plus nvd.h
clang2py -k cdefstum \
$AMKERN_AMD/amdkfd/kfd_pm4_headers_ai.h \
$AMKERN_AMD/amdgpu/nvd.h \
-o $BASE/am/pm4_nv.py
fixup $BASE/am/pm4_nv.py
# sdma_*.py: the shared sdma_registers.h plus one *_sdma_pkt_open.h per output module
clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
$AMKERN_AMD/amdgpu/vega10_sdma_pkt_open.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/am/sdma_4_0_0.py
fixup $BASE/am/sdma_4_0_0.py
clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
$AMKERN_AMD/amdgpu/navi10_sdma_pkt_open.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/am/sdma_5_0_0.py
fixup $BASE/am/sdma_5_0_0.py
clang2py -k cdefstum \
extra/hip_gpu_driver/sdma_registers.h \
$AMKERN_AMD/amdgpu/sdma_v6_0_0_pkt_open.h \
--clang-args="-I/opt/rocm/include -x c++" \
-o $BASE/am/sdma_6_0_0.py
fixup $BASE/am/sdma_6_0_0.py
# smu_v13_0_0.py / smu_v14_0_2.py: pmfw_if headers plus the local amdgpu_smu.h
clang2py -k cdefstum \
$AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu_v13_0_0_ppsmc.h \
$AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu13_driver_if_v13_0_0.h \
extra/amdpci/headers/amdgpu_smu.h \
-o $BASE/am/smu_v13_0_0.py
fixup $BASE/am/smu_v13_0_0.py
clang2py -k cdefstum \
$AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu_v14_0_0_pmfw.h \
$AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu_v14_0_2_ppsmc.h \
$AMKERN_AMD/pm/swsmu/inc/pmfw_if/smu14_driver_if_v14_0.h \
extra/amdpci/headers/amdgpu_smu.h \
--clang-args="-include stdint.h" \
-o $BASE/am/smu_v14_0_2.py
fixup $BASE/am/smu_v14_0_2.py
}
generate_sqtt() {
# Generate $BASE/sqtt.py from extra/sqtt/sqtt.h, then $BASE/rocprof.py from the
# rocprof-trace-decoder headers (pinned to ROCPROF_COMMIT_HASH).
clang2py -k cdefstum \
extra/sqtt/sqtt.h \
-o $BASE/sqtt.py
fixup $BASE/sqtt.py
sed -i "s\import ctypes\import ctypes, os\g" $BASE/sqtt.py
# smoke test: the sqtt module must import
python3 -c "import tinygrad.runtime.autogen.sqtt"
ROCPROF_COMMIT_HASH=dd0485100971522cc4cd8ae136bdda431061a04d
ROCPROF_SRC=/tmp/rocprof-trace-decoder-$ROCPROF_COMMIT_HASH
# clone once per commit hash and reset the checkout to that commit
if [ ! -d "$ROCPROF_SRC" ]; then
git clone https://github.com/ROCm/rocprof-trace-decoder $ROCPROF_SRC
pushd .
cd $ROCPROF_SRC
git reset --hard $ROCPROF_COMMIT_HASH
popd
fi
clang2py -k cdefstum \
$ROCPROF_SRC/include/rocprof_trace_decoder.h \
$ROCPROF_SRC/include/trace_decoder_instrument.h \
$ROCPROF_SRC/include/trace_decoder_types.h \
-o $BASE/rocprof.py
fixup $BASE/rocprof.py
# prepend a pylint opt-out as the new first line of the file
sed -i '1s/^/# pylint: skip-file\n/' $BASE/rocprof.py
sed -i "s/import ctypes/import ctypes, ctypes.util/g" $BASE/rocprof.py
patch_dlopen $BASE/rocprof.py rocprof-trace-decoder "'/usr/local/lib/librocprof-trace-decoder.so'" "'/usr/local/lib/librocprof-trace-decoder.dylib'"
# patch_dlopen uses the library name as the helper-name suffix; hyphens are not valid in a Python
# identifier, so rename the emitted helper with underscores
sed -i "s/def _try_dlopen_rocprof-trace-decoder():/def _try_dlopen_rocprof_trace_decoder():/g" $BASE/rocprof.py
# load the library through the renamed helper instead of the stub factory
sed -i "s|FunctionFactoryStub()|_try_dlopen_rocprof_trace_decoder()|g" $BASE/rocprof.py
}
generate_webgpu() {
# Generate $BASE/webgpu.py from extra/webgpu/webgpu.h.
clang2py extra/webgpu/webgpu.h -o $BASE/webgpu.py
fixup $BASE/webgpu.py
# rename the FIXME_STUB placeholder to webgpu and load the library from webgpu_support.WEBGPU_PATH
sed -i "s/FIXME_STUB/webgpu/g" "$BASE/webgpu.py"
sed -i "s/FunctionFactoryStub()/ctypes.CDLL(webgpu_support.WEBGPU_PATH)/g" "$BASE/webgpu.py"
sed -i "s/import ctypes/import ctypes, tinygrad.runtime.support.webgpu as webgpu_support/g" "$BASE/webgpu.py"
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.webgpu"
}
generate_libusb() {
# Generate $BASE/libusb.py from /usr/include/libusb-1.0/libusb.h.
clang2py -k cdefstum \
/usr/include/libusb-1.0/libusb.h \
-o $BASE/libusb.py
fixup $BASE/libusb.py
sed -i "s\import ctypes\import ctypes, ctypes.util, os\g" $BASE/libusb.py
# rename the FIXME_STUB placeholder to libusb
sed -i "s/FIXME_STUB/libusb/g" "$BASE/libusb.py"
# delete the text of the generated `libusb_le16_to_cpu = libusb_cpu_to_le16` alias
sed -i "s/libusb_le16_to_cpu = libusb_cpu_to_le16//g" "$BASE/libusb.py"
# library handle: None when neither LIBUSB_PATH nor find_library('usb-1.0') yields a path, else a CDLL
sed -i "s/FunctionFactoryStub()/None if (lib_path:=os.getenv('LIBUSB_PATH', ctypes.util.find_library('usb-1.0'))) is None else ctypes.CDLL(lib_path)/g" "$BASE/libusb.py"
# smoke test: the patched module must import
python3 -c "import tinygrad.runtime.autogen.libusb"
}
generate_mesa() {
  # Generate $BASE/mesa.py from a Mesa source checkout (tag $MESA_TAG), bound to a prebuilt
  # libtinymesa_cpu.so downloaded from the sirhcm/tinymesa release $TINYMESA_TAG.
  MESA_TAG="mesa-25.2.4"
  MESA_SRC=/tmp/mesa-$MESA_TAG
  TINYMESA_TAG=tinymesa-32dc66c
  TINYMESA_DIR=/tmp/tinymesa-$MESA_TAG-$TINYMESA_TAG/
  TINYMESA_SO=$TINYMESA_DIR/libtinymesa_cpu.so
  if [ ! -d "$MESA_SRC" ]; then
    # `--depth 1 --branch $MESA_TAG` already checks out the wanted revision, so no reset is needed
    # (the former `git reset --hard $MESA_COMMIT_HASH` used a variable that was never set)
    git clone --depth 1 --branch $MESA_TAG https://gitlab.freedesktop.org/mesa/mesa.git $MESA_SRC
    pushd .
    cd $MESA_SRC
    # clang 14 doesn't support packed enums
    sed -i "s/enum \w\+ \(\w\+\);$/uint8_t \1;/" $MESA_SRC/src/nouveau/headers/nv_device_info.h
    sed -i "s/enum \w\+ \(\w\+\);$/uint8_t \1;/" $MESA_SRC/src/nouveau/compiler/nak.h
    sed -i "s/nir_instr_type \(\w\+\);/uint8_t \1;/" $MESA_SRC/src/compiler/nir/nir.h
    # run Mesa's own header generators into gen/, which is on the include path below
    mkdir -p gen/util/format
    python3 src/util/format/u_format_table.py src/util/format/u_format.yaml --enums > gen/util/format/u_format_gen.h
    python3 src/compiler/nir/nir_opcodes_h.py > gen/nir_opcodes.h
    python3 src/compiler/nir/nir_intrinsics_h.py --outdir gen
    python3 src/compiler/nir/nir_intrinsics_indices_h.py --outdir gen
    python3 src/compiler/nir/nir_builder_opcodes_h.py > gen/nir_builder_opcodes.h
    python3 src/compiler/builtin_types_h.py gen/builtin_types.h
    popd
  fi
  if [ ! -d "$TINYMESA_DIR" ]; then
    mkdir $TINYMESA_DIR
    curl -L https://github.com/sirhcm/tinymesa/releases/download/$TINYMESA_TAG/libtinymesa_cpu-$MESA_TAG-linux-amd64.so -o $TINYMESA_SO
  fi
  clang2py -k cdefstu \
    $MESA_SRC/src/compiler/nir/nir.h \
    $MESA_SRC/src/compiler/nir/nir_builder.h \
    $MESA_SRC/src/compiler/nir/nir_shader_compiler_options.h \
    $MESA_SRC/src/compiler/nir/nir_serialize.h \
    $MESA_SRC/gen/nir_intrinsics.h \
    $MESA_SRC/src/nouveau/headers/nv_device_info.h \
    $MESA_SRC/src/nouveau/compiler/nak.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld_passmgr.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld_misc.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld_type.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld_init.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld_nir.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld_struct.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld_jit_types.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld_flow.h \
    $MESA_SRC/src/gallium/auxiliary/gallivm/lp_bld_const.h \
    $MESA_SRC/src/compiler/glsl_types.h \
    $MESA_SRC/src/util/blob.h \
    $MESA_SRC/src/util/ralloc.h \
    --clang-args="-DHAVE_ENDIAN_H -DHAVE_STRUCT_TIMESPEC -DHAVE_PTHREAD -I$MESA_SRC/src -I$MESA_SRC/include -I$MESA_SRC/gen -I$MESA_SRC/src/compiler/nir -I$MESA_SRC/src/gallium/auxiliary -I$MESA_SRC/src/gallium/include -I$(llvm-config-20 --includedir)" \
    -l $TINYMESA_SO \
    -o $BASE/mesa.py
  # captured before the patches below; embedded as a base64 string a few lines further down
  LVP_NIR_OPTIONS=$(./extra/mesa/lvp_nir_options.sh $MESA_SRC)
  fixup $BASE/mesa.py
  # inject _try_dlopen_tinymesa_cpu(); its path expressions reference os and helpers, which the
  # import rewrite further down brings into scope
  patch_dlopen $BASE/mesa.py tinymesa_cpu "(BASE:=os.getenv('MESA_PATH', f\"/usr{'/local/' if helpers.OSX else '/'}lib\"))+'/libtinymesa_cpu'+(EXT:='.dylib' if helpers.OSX else '.so')" "f'{BASE}/libtinymesa{EXT}'" "'/opt/homebrew/lib/libtinymesa_cpu.dylib'" "'/opt/homebrew/lib/libtinymesa.dylib'"
  echo "lvp_nir_options = gzip.decompress(base64.b64decode('$LVP_NIR_OPTIONS'))" >> $BASE/mesa.py
  # wrap every in_dll lookup in try/except so a missing symbol does not break the import
  sed -i "/in_dll/s/.*/try: &\nexcept (AttributeError, ValueError): pass/" $BASE/mesa.py
  sed -i "s/import ctypes/import ctypes, ctypes.util, os, gzip, base64, subprocess, tinygrad.helpers as helpers/" $BASE/mesa.py
  # load the library through the injected helper and keep the handle in `dll`
  sed -i "s/ctypes.CDLL('.\+')/(dll := _try_dlopen_tinymesa_cpu())/" $BASE/mesa.py
  # module-level __getattr__: explains a missing attribute or a missing library
  echo "def __getattr__(nm): raise AttributeError('LLVMpipe requires tinymesa_cpu' if 'tinymesa_cpu' not in dll._name else f'attribute {nm} not found') if dll else FileNotFoundError(f'libtinymesa not found (MESA_PATH={BASE}). See https://github.com/sirhcm/tinymesa ($TINYMESA_TAG, $MESA_TAG)')" >> $BASE/mesa.py
  sed -i "s/ctypes.glsl_base_type/glsl_base_type/" $BASE/mesa.py
  # bitfield bug in clang2py
  sed -i "s/('fp_fast_math', ctypes.c_bool, 9)/('fp_fast_math', ctypes.c_uint32, 9)/" $BASE/mesa.py
  sed -i "s/('\(\w\+\)', pipe_shader_type, 8)/('\1', ctypes.c_ubyte)/" $BASE/mesa.py
  # strip a stray "()" that directly follows a number
  sed -i "s/\([0-9]\+\)()/\1/" $BASE/mesa.py
  sed -i '/struct_nir_builder._pack_ = 1 # source:False/d' "$BASE/mesa.py"
  # smoke test: the patched module must import
  python3 -c "import tinygrad.runtime.autogen.mesa"
}
# Entry point: run the generator selected by the first argument.
case "$1" in
  # every single-target name maps directly onto its generate_<name> function
  opencl|hip|comgr|cuda|nvrtc|hsa|kfd|nv|amd|am|sqtt|qcom|io_uring|ib|llvm|kgsl|adreno|pci|vfio|webgpu|libusb|mesa)
    "generate_$1"
    ;;
  # "all" runs this fixed subset, in this order
  all)
    generate_opencl; generate_hip; generate_comgr; generate_cuda; generate_nvrtc; generate_hsa; generate_kfd; generate_nv; generate_amd; generate_io_uring; generate_am; generate_webgpu; generate_mesa
    ;;
  *)
    # a missing or unknown target is an error: report on stderr and exit non-zero so that
    # callers and CI do not mistake a typo for a successful regeneration
    echo "usage: $0 <type>" >&2
    exit 1
    ;;
esac

View file

@ -19,5 +19,6 @@ trap 'rm -f "$TMP"' EXIT
EOF
sed -n '/struct nir_shader_compiler_options/,/^}/{p;/^}/q}' $1/src/gallium/drivers/llvmpipe/lp_screen.c
echo "int main(void) { write(1, &gallivm_nir_options, sizeof(gallivm_nir_options)); }"
) | cc -x c -o $TMP - -I$1/src/compiler/nir -I$1/src -I$1/include && $TMP | gzip | base64 -w0
) | cc -x c -o $TMP - -I$1/src/compiler/nir -I$1/src -I$1/include || exit 1
printf 'lvp_nir_options = gzip.decompress(base64.b64decode("%s"))' $("$TMP" | gzip | base64 -w0)

53
extra/nvJitLink.h Normal file
View file

@ -0,0 +1,53 @@
/*
* NVIDIA_COPYRIGHT_BEGIN
*
* Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION and its licensors retain all intellectual property
* and proprietary rights in and to this software, related documentation
* and any modifications thereto. Any use, reproduction, disclosure or
* distribution of this software and related documentation without an express
* license agreement from NVIDIA CORPORATION is strictly prohibited.
*
* NVIDIA_COPYRIGHT_END
*/
#include <stdint.h>
#include <stdlib.h>
/* Status code returned by every nvJitLink* function declared in this header.
 * Only NVJITLINK_SUCCESS has an explicit value; by the C enum rule each following
 * enumerator is the previous one plus one, so the error codes are 1..6 in declaration order. */
typedef enum {
NVJITLINK_SUCCESS = 0,
NVJITLINK_ERROR_UNRECOGNIZED_OPTION,
NVJITLINK_ERROR_MISSING_ARCH,
NVJITLINK_ERROR_INVALID_INPUT,
NVJITLINK_ERROR_PTX_COMPILE,
NVJITLINK_ERROR_NVVM_COMPILE,
NVJITLINK_ERROR_INTERNAL
} nvJitLinkResult;
/* Kind of input handed to nvJitLinkAddData / nvJitLinkAddFile through their `inputType` parameter.
 * NONE = 0 and CUBIN = 1 are explicit; the remaining enumerators continue from CUBIN
 * (PTX = 2, LTOIR = 3, FATBIN = 4, OBJECT = 5, LIBRARY = 6). */
typedef enum {
NVJITLINK_INPUT_NONE = 0,
NVJITLINK_INPUT_CUBIN = 1,
NVJITLINK_INPUT_PTX,
NVJITLINK_INPUT_LTOIR,
NVJITLINK_INPUT_FATBIN,
NVJITLINK_INPUT_OBJECT,
NVJITLINK_INPUT_LIBRARY
} nvJitLinkInputType;
/* Opaque handle: a pointer to `struct nvJitLink`, which is never defined in this header. */
typedef struct nvJitLink* nvJitLinkHandle;
/* Create/Destroy take a pointer to the handle rather than the handle itself. */
nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle *handle, uint32_t numOptions, const char **options);
nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle *handle);
/* Inputs are added either from memory (data + size + name) or by file name. */
nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void *data, size_t size, const char *name);
nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char *fileName);
nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle);
/* Each Get*Size function is paired with a getter that writes into a caller-supplied buffer.
 * NOTE(review): presumably the buffer must be at least the reported size - confirm against NVIDIA's nvJitLink documentation. */
nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t *size);
nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void *cubin);
nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t *size);
nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char *ptx);
nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t *size);
nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char *log);
nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t *size);
nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char *log);
/* Version query: writes through the two out-pointers and needs no handle. */
nvJitLinkResult nvJitLinkVersion(unsigned int *major, unsigned int *minor);

View file

@ -65,6 +65,8 @@
#define NVCEC0_QMDV05_00_GRID_HEIGHT_RESUME MW(271:256)
#define NVCEC0_QMDV05_00_GRID_DEPTH_RESUME MW(287:272)
#define NVCEC0_QMDV05_00_RELEASE_ENABLE(i) MW((288+(i)*16):(288+(i)*16))
#define NVCEC0_QMDV05_00_RELEASE0_ENABLE NVCEC0_QMDV05_00_RELEASE_ENABLE(0)
#define NVCEC0_QMDV05_00_RELEASE1_ENABLE NVCEC0_QMDV05_00_RELEASE_ENABLE(1)
#define NVCEC0_QMDV05_00_RELEASE_ENABLE_FALSE 0x00000000
#define NVCEC0_QMDV05_00_RELEASE_ENABLE_TRUE 0x00000001
#define NVCEC0_QMDV05_00_RELEASE_STRUCTURE_SIZE(i) MW((290+(i)*16):(289+(i)*16))

View file

@ -58,7 +58,7 @@ def install_hook(c_function, python_function):
return orig_func
# *** ioctl lib end ***
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
from tinygrad.runtime.autogen import nv_gpu
nvescs = {getattr(nv_gpu, x):x for x in dir(nv_gpu) if x.startswith("NV_ESC")}
nvcmds = {getattr(nv_gpu, x):(x, getattr(nv_gpu, "struct_"+x+"_PARAMS", getattr(nv_gpu, "struct_"+x.replace("_CMD_", "_")+"_PARAMS", None))) for x in dir(nv_gpu) if \
x.startswith("NV") and x[6:].startswith("_CTRL_") and isinstance(getattr(nv_gpu, x), int)}
@ -272,4 +272,4 @@ def compare_launch_state(states, good_states):
return True, "PASS"
# IOCTL=1 CUDA=1 CUDA_PTX=1 python3 test/test_ops.py TestOps.test_tiny_add
# IOCTL=1 CUDA=1 CUDA_PTX=1 python3 test/test_ops.py TestOps.test_tiny_add

View file

@ -185,9 +185,7 @@ class RGP:
magic_number=sqtt.SQTT_FILE_MAGIC_NUMBER,
version_major=sqtt.SQTT_FILE_VERSION_MAJOR,
version_minor=sqtt.SQTT_FILE_VERSION_MINOR,
flags=sqtt.struct_sqtt_file_header_flags(
_0=sqtt.union_sqtt_file_header_flags_0(value=1),
),
flags=sqtt.struct_sqtt_file_header_flags(value=1,),
chunk_offset=ctypes.sizeof(sqtt.struct_sqtt_file_header),
)
chunks = [
@ -265,7 +263,7 @@ class RGP:
profiling_mode=sqtt.SQTT_PROFILING_MODE_PRESENT,
instruction_trace_mode=sqtt.SQTT_INSTRUCTION_TRACE_FULL_FRAME if sqtt_itrace_enabled else sqtt.SQTT_INSTRUCTION_TRACE_DISABLED,
instruction_trace_data=sqtt.union_sqtt_instruction_trace_data(
shader_engine_filter=sqtt.struct_sqtt_instruction_trace_data_shader_engine_filter(mask=sqtt_itrace_se_mask),
shader_engine_filter=sqtt.union_sqtt_instruction_trace_data_shader_engine_filter(mask=sqtt_itrace_se_mask),
),
)),
*flatten([(
@ -276,13 +274,11 @@ class RGP:
),
shader_engine_index=sqtt_event.se,
sqtt_version={11: sqtt.SQTT_VERSION_3_2, 12: sqtt.SQTT_VERSION_3_3}.get(gfx_ver),
_0=sqtt.union_sqtt_file_chunk_sqtt_desc_0(
v1=sqtt.struct_sqtt_file_chunk_sqtt_desc_0_v1(
instrumentation_spec_version=1,
instrumentation_api_version=0,
compute_unit_index=0,
)
),
v1=sqtt.struct_sqtt_file_chunk_sqtt_desc_0_v1(
instrumentation_spec_version=1,
instrumentation_api_version=0,
compute_unit_index=0,
)
)),
RGPChunk(sqtt.struct_sqtt_file_chunk_sqtt_data(
header=sqtt.struct_sqtt_file_chunk_header(

View file

@ -30,7 +30,6 @@ packages = [
'tinygrad.runtime',
'tinygrad.runtime.autogen',
'tinygrad.runtime.autogen.am',
'tinygrad.runtime.autogen.nv',
'tinygrad.runtime.graph',
'tinygrad.runtime.support',
'tinygrad.runtime.support.am',
@ -162,7 +161,6 @@ exclude = [
".git/",
"docs/",
"extra/",
"tinygrad/runtime/autogen",
"test/external/mlperf_resnet",
"test/external/mlperf_unet3d",
]
@ -228,6 +226,7 @@ select = [
"F541",
"F841",
]
"tinygrad/runtime/autogen/**/*.py" = ["E501", "F401", "E722", "E731", "F821", "A006"]
[tool.ruff.format]
exclude = ["*"]

View file

@ -33,7 +33,7 @@ remu = _try_dlopen_remu()
def create_sdma_packets():
# TODO: clean up this, if we want to keep it
structs = {}
for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("struct_SDMA_PKT_") and name.endswith("_TAG")]:
for name,pkt in [(name,s) for name,s in amd_gpu.__dict__.items() if name.startswith("rocr_AMD_SDMA_PKT_") and name.endswith("_TAG")]:
names = set()
fields = []
for pkt_fields in pkt._fields_:
@ -47,7 +47,7 @@ def create_sdma_packets():
# merge together 64-bit fields, otherwise just append them
if fname.endswith("_63_32") and fields[-1][0].endswith("_31_0"): fields[-1] = tuple([fname[:-6], ctypes.c_ulong, 64])
else: fields.append(tuple([fname, *union_fields[1:]]))
new_name = name[16:-4].lower()
new_name = name[18:-4].lower()
structs[new_name] = init_c_struct_t(tuple(fields))
assert ctypes.sizeof(structs[new_name]) == ctypes.sizeof(pkt), f"{ctypes.sizeof(structs[new_name])} != {ctypes.sizeof(pkt)}"
return type("SDMA_PKTS", (object, ), structs)

View file

@ -164,7 +164,7 @@ def cuStreamWaitEvent(stream: Any, event, flags: int) -> int: return orig_cuda.C
def cuCtxSynchronize() -> int: return orig_cuda.CUDA_SUCCESS
def cuGetErrorString(error: int, pStr) -> int:
error_str = orig_cuda.cudaError_enum__enumvalues.get(error, "Unknown CUDA error").encode()
error_str = orig_cuda.enum_cudaError_enum.get(error, "Unknown CUDA error").encode()
buf = ctypes.create_string_buffer(error_str)
# Set the pointer to point to our error string buffer
pStr._obj.value = ctypes.cast(buf, ctypes.POINTER(ctypes.c_char))

View file

@ -1,5 +1,5 @@
import ctypes, mmap, collections, functools, os
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
from tinygrad.runtime.autogen import nv_gpu
from typing import Any
from tinygrad.helpers import to_mv
from test.mockgpu.driver import VirtDriver, VirtFileDesc, VirtFile
@ -254,4 +254,4 @@ class NVDriver(VirtDriver):
for gpu in self.gpus.values():
for q in gpu.queues:
if q.ctrl.GPGet != q.ctrl.GPPut:
any_progress |= q.execute()
any_progress |= q.execute()

View file

@ -1,5 +1,5 @@
import ctypes, time
import tinygrad.runtime.autogen.nv_gpu as nv_gpu
from tinygrad.runtime.autogen import nv_gpu
from enum import Enum, auto
from test.mockgpu.gpu import VirtGPU
from test.mockgpu.helpers import _try_dlopen_gpuocelot

View file

@ -416,6 +416,7 @@ def to_mv(ptr:int, sz:int) -> memoryview: return memoryview((ctypes.c_uint8 * sz
def mv_address(mv):
  """Return the integer address of the first byte of the writable buffer `mv`."""
  # from_buffer shares memory with `mv` (no copy), so addressof yields the buffer's own address
  first_byte = ctypes.c_char.from_buffer(mv)
  return ctypes.addressof(first_byte)
def to_char_p_p(options: list[bytes], to_type=ctypes.c_char):
  """Build a ctypes array of `POINTER(to_type)` with one entry per byte string in `options`.

  Each byte string is copied into its own NUL-terminated string buffer and the array element
  points at that copy.
  """
  elem_ptr_t = ctypes.POINTER(to_type)
  pointers = [ctypes.cast(ctypes.create_string_buffer(opt), elem_ptr_t) for opt in options]
  return (elem_ptr_t * len(options))(*pointers)
def charptr(s:str|bytes): return ctypes.cast(ctypes.c_char_p(s if isinstance(s, bytes) else s.encode()), ctypes.POINTER(ctypes.c_char))
@functools.cache
def init_c_struct_t(fields: tuple[tuple[str, type[ctypes._SimpleCData]], ...]):
class CStruct(ctypes.Structure):

View file

@ -1,10 +1,10 @@
from typing import Callable, cast, Any
from tinygrad.dtype import AddrSpace, DType, PtrDType, dtypes
from tinygrad.helpers import DEBUG, OSX, unwrap
from tinygrad.helpers import DEBUG, OSX, unwrap, charptr
from tinygrad.renderer import Renderer
from tinygrad.renderer.cstyle import CUDARenderer
from tinygrad.uop.ops import GroupOp, Ops, UOp, PatternMatcher, UPat, range_str
import tinygrad.runtime.autogen.mesa as mesa
from tinygrad.runtime.autogen import mesa
import base64, ctypes, ctypes.util, struct, functools, inspect
def g(s:str):
  """Fetch the attribute named `s` from the imported `mesa` bindings module."""
  return getattr(mesa, s)
@ -51,7 +51,7 @@ def nir_instr(nc=1, bs=lambda: None, intrins=None, srcs=None, has_def=True, df=N
instr = f(*args, **kwargs)
if has_def: mesa.nir_def_init(instr.contents.instr, getattr(instr.contents, "def"), go(nc), go(bs))
for k, v in go(intrins or {}).items():
idx = mesa.nir_intrinsic_infos[instr.contents.intrinsic].index_map[g(f"NIR_INTRINSIC_{k}")]
idx = mesa.nir_intrinsic_infos[instr.contents.intrinsic.value].index_map[g(f"NIR_INTRINSIC_{k}")]
assert idx > 0
instr.contents.const_index[idx - 1] = go(v)
for i, src in enumerate(go(srcs or [])): ctypes.cast(instr.contents.src, ctypes.POINTER(mesa.nir_src))[i] = go(src)
@ -177,7 +177,7 @@ class NIRRenderer(Renderer):
elif u.op is Ops.AFTER:
self.r[u] = self.r[u.src[0]]
elif u.op == Ops.SINK:
if u.arg is not None: self.b.shader.contents.info.name = mesa.char_pointer_cast(u.arg.function_name)
if u.arg is not None: self.b.shader.contents.info.name = charptr(u.arg.function_name.encode())
elif u.op == Ops.DEFINE_LOCAL:
self.r[u] = nimm(self.b, self.b.shader.contents.info.shared_size, dtypes.long)
self.b.shader.contents.info.shared_size += u.dtype.nbytes()

View file

@ -1,11 +1,22 @@
import importlib, pathlib
from tinygrad.helpers import system
import glob, importlib, pathlib, subprocess, tarfile
from tinygrad.helpers import fetch, flatten, system
root = (here:=pathlib.Path(__file__).parent).parents[2]
nv_src = "https://github.com/NVIDIA/open-gpu-kernel-modules/archive/81fe4fb417c8ac3b9bdcc1d56827d116743892a5.tar.gz"
# Return the binding module `name`, writing its source file via the generator's gen() when needed.
#   name:   module name; the source file is root/<path with '.' replaced by '/'>/<name>.py and '/' in name maps to '.' on import
#   dll:    forwarded unchanged to gen()
#   files:  list of header paths/patterns, or a callable returning one
#   kwargs: `path` (default: this module's __name__), `tarball` and `preprocess` are popped here;
#           `args`, `anon_names` and `epilog` are rewritten in place; everything left is forwarded to gen()
# NOTE(review): statement nesting is not visible in this view; presumably the existence check below gates generation - confirm.
def load(name, dll, files, **kwargs):
if not (f:=(root/(path:=kwargs.pop("path", __name__)).replace('.','/')/f"{name}.py")).exists():
files = files() if callable(files) else files
# callable `files` / `args` values are invoked here to obtain the actual lists
files, kwargs['args'] = files() if callable(files) else files, args() if callable(args:=kwargs.get('args', [])) else args
if (tarball:=kwargs.pop('tarball', None)):
# dangerous for arbitrary urls!
with tarfile.open(fetch(tarball, gunzip=tarball.endswith("gz"))) as tf:
# the archive is unpacked under /tmp and its first member name is taken as the base directory
tf.extractall("/tmp")
base = f"/tmp/{tf.getnames()[0]}"
# "{}" placeholders in file paths, compiler args and anon_names keys are filled with the base directory
files, kwargs['args'] = [str(f).format(base) for f in files], [a.format(base) for a in kwargs.get('args', [])]
kwargs['anon_names'] = {k.format(base):v for k,v in kwargs.get('anon_names', {}).items()}
# optional hook, called with the base directory
if (preprocess:=kwargs.pop('preprocess', None)): preprocess(base)
# string entries containing '*' are glob-expanded (recursive, sorted); all other entries are kept as they are
files = flatten(sorted(glob.glob(p, recursive=True)) if isinstance(p, str) and '*' in p else [p] for p in files)
# a callable epilog receives the base directory when a tarball was given, otherwise it is called without arguments
kwargs['epilog'] = (epi(base) if tarball else epi()) if callable(epi:=kwargs.get('epilog', [])) else epi
# the generator module is imported only at this point, and its output is written to the module's source file
f.write_text(importlib.import_module("tinygrad.runtime.support.autogen").gen(dll, files, **kwargs))
return importlib.import_module(f"{path}.{name.replace('/', '.')}")
@ -14,4 +25,100 @@ def __getattr__(nm):
case "libc": return load("libc", ["find_library('c')"], lambda: (
[i for i in system("dpkg -L libc6-dev").split() if 'sys/mman.h' in i or 'sys/syscall.h' in i] +
["/usr/include/string.h", "/usr/include/elf.h", "/usr/include/unistd.h", "/usr/include/asm-generic/mman-common.h"]), use_errno=True)
case "opencl": return load("opencl", ["find_library('OpenCL')"], ["/usr/include/CL/cl.h"])
case "cuda": return load("cuda", ["find_library('cuda')"], ["/usr/include/cuda.h"], args=["-D__CUDA_API_VERSION_INTERNAL"], parse_macros=False)
case "nvrtc": return load("nvrtc", ["find_library('nvrtc')"], ["/usr/include/nvrtc.h"])
case "nvjitlink": load("nvjitlink", ["find_library('nvJitLink')"], [root/"extra/nvJitLink.h"])
case "kfd": return load("kfd", [], ["/usr/include/linux/kfd_ioctl.h"])
case "nv_gpu":
return load("nv_gpu", [], [
*[root/"extra/nv_gpu_driver"/s for s in ["clc6c0qmd.h","clcec0qmd.h"]], "{}/kernel-open/common/inc/nvmisc.h",
*[f"{{}}/src/common/sdk/nvidia/inc/class/cl{s}.h" for s in ["0000", "0080", "2080", "2080_notification", "c56f", "c86f", "c96f", "c761",
"83de", "c6c0", "cdc0"]],
*[f"{{}}/kernel-open/nvidia-uvm/{s}.h" for s in ["clc6b5", "clc9b5", "uvm_ioctl", "uvm_linux_ioctl", "hwref/ampere/ga100/dev_fault"]],
*[f"{{}}/src/nvidia/arch/nvalloc/unix/include/nv{s}.h" for s in ["_escape", "-ioctl", "-ioctl-numbers",
"-ioctl-numa", "-unix-nvos-params-wrappers"]],
*[f"{{}}/src/common/sdk/nvidia/inc/{s}.h" for s in ["alloc/alloc_channel", "nvos", "ctrl/ctrlc36f", "ctrl/ctrlcb33",
"ctrl/ctrla06c", "ctrl/ctrl90f1"]],
*[f"{{}}/src/common/sdk/nvidia/inc/ctrl/ctrl{s}/*.h" for s in ["0000", "0080", "2080", "83de"]],
"{}/kernel-open/common/inc/nvstatus.h", "{}/src/nvidia/generated/g_allclasses.h"
], args=[
"-include", "{}/src/common/sdk/nvidia/inc/nvtypes.h", "-I{}/src/common/inc", "-I{}/kernel-open/nvidia-uvm", "-I{}/kernel-open/common/inc",
"-I{}/src/common/sdk/nvidia/inc", "-I{}/src/nvidia/arch/nvalloc/unix/include", "-I{}/src/common/sdk/nvidia/inc/ctrl"
], rules=[(r'MW\(([^:]+):(.+)\)',r'(\1, \2)')], tarball=nv_src, anon_names={"{}/kernel-open/common/inc/nvstatus.h:37":"nv_status_codes"})
case "nv": return load("nv", [], [
*[f"{{}}/src/nvidia/inc/kernel/gpu/{s}.h" for s in ["fsp/kern_fsp_cot_payload", "gsp/gsp_init_args"]],
*[f"{{}}/src/nvidia/arch/nvalloc/common/inc/{s}.h" for s in ["gsp/gspifpub", "gsp/gsp_fw_wpr_meta", "gsp/gsp_fw_sr_meta", "rmRiscvUcode",
"fsp/fsp_nvdm_format"]],
*[f"{{}}/src/nvidia/inc/kernel/vgpu/{s}.h" for s in ["rpc_headers", "rpc_global_enums"]],
"{}/src/common/uproc/os/common/include/libos_init_args.h", "{}/src/common/shared/msgq/inc/msgq/msgq_priv.h",
"{}/src/nvidia/generated/g_rpc-structures.h", root/"extra/nv_gpu_driver/g_rpc-message-header.h", root/"extra/nv_gpu_driver/gsp_static_config.h",
root/"extra/nv_gpu_driver/vbios.h", root/"extra/nv_gpu_driver/pci_exp_table.h"
], args=[
"-DRPC_MESSAGE_STRUCTURES", "-DRPC_STRUCTURES", "-include", "{}/src/common/sdk/nvidia/inc/nvtypes.h", "-I{}/src/nvidia/generated",
"-I{}/src/common/inc", "-I{}/src/nvidia/inc", "-I{}/src/nvidia/interface/", "-I{}/src/nvidia/inc/kernel", "-I{}/src/nvidia/inc/libraries",
"-I{}/src/nvidia/arch/nvalloc/common/inc", "-I{}/kernel-open/nvidia-uvm", "-I{}/kernel-open/common/inc", "-I{}/src/common/sdk/nvidia/inc",
"-I{}/src/nvidia/arch/nvalloc/unix/include", "-I{}/src/common/sdk/nvidia/inc/ctrl"
], tarball=nv_src, anon_names={
"{}/src/nvidia/inc/kernel/vgpu/rpc_global_enums.h:8": "rpc_fns",
"{}/src/nvidia/inc/kernel/vgpu/rpc_global_enums.h:244": "rpc_events"
})
# this defines all syscall numbers. should probably unify linux autogen?
case "io_uring": return load("io_uring", [], ["/usr/include/liburing.h", "/usr/include/linux/io_uring.h", "/usr/include/asm-generic/unistd.h"],
rules=[('__NR', 'NR')])
case "ib": return load("ib", ["ibverbs"], ["/usr/include/infiniband/verbs.h", "/usr/include/infiniband/verbs_api.h",
"/usr/include/infiniband/ib_user_ioctl_verbs.h","/usr/include/rdma/ib_user_verbs.h"], use_errno=True)
case "llvm": return load("llvm", ["LLVM_PATH"], lambda: [system("llvm-config-20 --includedir")+"/llvm-c/**/*.h"],
args=lambda: system("llvm-config-20 --cflags").split(), recsym=True,
prolog=["from tinygrad.runtime.support.llvm import LLVM_PATH"])
case "pci": return load("pci", [], ["/usr/include/linux/pci_regs.h"])
case "vfio": return load("vfio", [], ["/usr/include/linux/vfio.h"])
# could add rule: WGPU_COMMA -> ','
case "webgpu":
return load("webgpu", ["WEBGPU_PATH"], [root/"extra/webgpu/webgpu.h"], prolog=["from tinygrad.runtime.support.webgpu import WEBGPU_PATH"])
case "libusb": return load("libusb", ["os.getenv('LIBUSB_PATH', find_library('usb-1.0'))"], ["/usr/include/libusb-1.0/libusb.h"])
case "hip": return load("hip", ["os.getenv('ROCM_PATH', '/opt/rocm')+'/lib/libamdhip64.so'"], ["/opt/rocm/include/hip/hip_ext.h",
"/opt/rocm/include/hip/hiprtc.h", "/opt/rocm/include/hip/hip_runtime_api.h", "/opt/rocm/include/hip/driver_types.h"],
args=["-D__HIP_PLATFORM_AMD__", "-I/opt/rocm/include", "-x", "c++"])
case "comgr" | "comgr_3":
try: use_3 = nm == "comgr_3" or int(system("dpkg-query -f '${version}' -W comgr")[1]) >= 3
except FileNotFoundError: use_3 = nm == "comgr_3"
return load("comgr_3" if use_3 else "comgr", [
"os.getenv('ROCM_PATH', '/opt/rocm')+'/lib/libamd_comgr.so'", "'/usr/local/lib/libamd_comgr.dylib'", "'/opt/homebrew/lib/libamd_comgr.dylib'"
], ["/opt/rocm/include/amd_comgr/amd_comgr.h"], args=["-D__HIP_PLATFORM_AMD__", "-I/opt/rocm/include", "-x", "c++"])
case "hsa": return load("hsa", ["os.getenv('ROCM_PATH', '/opt/rocm')+'/lib/libhsa-runtime64.so'", "find_library('hsa-runtime64')"], [
f"/opt/rocm/include/hsa/{s}.h" for s in ["hsa", "hsa_ext_amd", "amd_hsa_signal", "amd_hsa_queue", "amd_hsa_kernel_code", "hsa_ext_finalize",
"hsa_ext_image", "hsa_ven_amd_aqlprofile"] ], args=["-I/opt/rocm/include"])
case "amd_gpu": return load("amd_gpu", [], [root/f"extra/hip_gpu_driver/{s}.h" for s in ["sdma_registers", "nvd", "gc_11_0_0_offset",
"sienna_cichlid_ip_offset"]],
args=["-I/opt/rocm/include", "-x", "c++"])
case "kgsl": return load("kgsl", [], [root/"extra/qcom_gpu_driver/msm_kgsl.h"], args=["-D__user="])
case "adreno": return load("adreno", [], [root/"extra/qcom_gpu_driver/a6xx.xml.h"])
case "qcom_dsp":
return load("qcom_dsp", [], [root/f"extra/dsp/include/{s}.h" for s in ["ion", "msm_ion", "adsprpc_shared", "remote_default", "apps_std"]])
case "sqtt": return load("sqtt", [], [root/"extra/sqtt/sqtt.h"])
case "rocprof":
return load("rocprof", ["find_library('rocprof-trace-decoder')", p:="'/usr/local/lib/rocprof-trace-decoder.so'", p.replace('so','dylib')],
[f"{{}}/include/{s}.h" for s in ["rocprof_trace_decoder", "trace_decoder_instrument", "trace_decoder_types"]],
tarball="https://github.com/ROCm/rocprof-trace-decoder/archive/dd0485100971522cc4cd8ae136bdda431061a04d.tar.gz")
case "mesa": return load("mesa", ["find_library('tinymesa_cpu')",
"(BASE:=os.getenv('MESA_PATH', f\"/usr{'/local/' if OSX else '/'}lib\"))+'/libtinymesa_cpu'+(EXT:='.dylib' if OSX else '.so')",
"f'{BASE}/libtinymesa{EXT}'", "'/opt/homebrew/lib/libtinymesa_cpu.dylib'", "'/opt/homebrew/lib/libtinymesa.dylib'"], [
*[f"{{}}/src/compiler/nir/{s}.h" for s in ["nir", "nir_builder", "nir_shader_compiler_options", "nir_serialize"]], "{}/gen/nir_intrinsics.h",
*[f"{{}}/src/nouveau/{s}.h" for s in ["headers/nv_device_info", "compiler/nak"]],
*[f"{{}}/src/gallium/auxiliary/gallivm/lp_bld{s}.h" for s in ["", "_passmgr", "_misc", "_type", "_init", "_nir", "_struct", "_jit_types",
"_flow", "_const"]],
"{}/src/compiler/glsl_types.h", "{}/src/util/blob.h", "{}/src/util/ralloc.h"], args=lambda:[
"-DHAVE_ENDIAN_H", "-DHAVE_STRUCT_TIMESPEC", "-DHAVE_PTHREAD", "-DHAVE_FUNC_ATTRIBUTE_PACKED", "-I{}/src", "-I{}/include", "-I{}/gen",
"-I{}/src/compiler/nir", "-I{}/src/gallium/auxiliary", "-I{}/src/gallium/include", f"-I{system('llvm-config-20 --includedir')}"],
preprocess=lambda path: subprocess.run("""mkdir -p gen/util/format
python3 src/util/format/u_format_table.py src/util/format/u_format.yaml --enums > gen/util/format/u_format_gen.h
python3 src/compiler/nir/nir_opcodes_h.py > gen/nir_opcodes.h
python3 src/compiler/nir/nir_intrinsics_h.py --outdir gen
python3 src/compiler/nir/nir_intrinsics_indices_h.py --outdir gen
python3 src/compiler/nir/nir_builder_opcodes_h.py > gen/nir_builder_opcodes.h
python3 src/compiler/nir/nir_intrinsics_h.py --outdir gen
python3 src/compiler/builtin_types_h.py gen/builtin_types.h""", cwd=path, shell=True, check=True),
tarball="https://gitlab.freedesktop.org/mesa/mesa/-/archive/mesa-25.2.4/mesa-25.2.4.tar.gz",
prolog=["import gzip, base64", "from tinygrad.helpers import OSX"], epilog=lambda path: [system(f"{root}/extra/mesa/lvp_nir_options.sh {path}")])
case _: raise AttributeError(f"no such autogen: {nm}")

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,23 @@
from tinygrad.runtime.autogen import load, root
am_src="https://github.com/ROCm/ROCK-Kernel-Driver/archive/ceb12c04e2b5b53ec0779362831f5ee40c4921e4.tar.gz"
AMD="{}/drivers/gpu/drm/amd"
inc = ["-include", "stdint.h"]
def __getattr__(nm):
match nm:
case "am": return load("am/am", [], [root/f"extra/amdpci/headers/{s}.h" for s in ["v11_structs", "v12_structs", "amdgpu_vm", "discovery",
"amdgpu_ucode", "psp_gfx_if", "amdgpu_psp", "amdgpu_irq", "amdgpu_doorbell"]]+[f"{AMD}/include/soc15_ih_clientid.h"], args=inc, tarball=am_src)
case "pm4_soc15": return load("am/pm4_soc15", [], [f"{AMD}/amdkfd/kfd_pm4_headers_ai.h", f"{AMD}/amdgpu/soc15d.h"], tarball=am_src)
case "pm4_nv": return load("am/pm4_nv", [], [f"{AMD}/amdkfd/kfd_pm4_headers_ai.h", f"{AMD}/amdgpu/nvd.h"], tarball=am_src)
case "sdma_4_0_0": return load("am/sdma_4_0_0", [], [root/"extra/hip_gpu_driver/sdma_registers.h", f"{AMD}/amdgpu/vega10_sdma_pkt_open.h"],
args=["-I/opt/rocm/include", "-x", "c++"], tarball=am_src),
case "sdma_5_0_0": return load("am/sdma_5_0_0", [], [root/"extra/hip_gpu_driver/sdma_registers.h", f"{AMD}/amdgpu/navi10_sdma_pkt_open.h"],
args=["-I/opt/rocm/include", "-x", "c++"], tarball=am_src),
case "sdma_6_0_0": return load("am/sdma_6_0_0", [], [root/"extra/hip_gpu_driver/sdma_registers.h", f"{AMD}//amdgpu/sdma_v6_0_0_pkt_open.h"],
args=["-I/opt/rocm/include", "-x", "c++"], tarball=am_src),
case "smu_v13_0_0": return load("am/smu_v13_0_0",[],[f"{AMD}/pm/swsmu/inc/pmfw_if/{s}.h" for s in ["smu_v13_0_0_ppsmc","smu13_driver_if_v13_0_0"]]
+[root/"extra/amdpci/headers/amdgpu_smu.h"], tarball=am_src),
case "smu_v14_0_2": return load("am/smu_v14_0_2", [], [f"{AMD}/pm/swsmu/inc/pmfw_if/{s}.h" for s in ["smu_v14_0_0_pmfw", "smu_v14_0_2_ppsmc",
"smu14_driver_if_v14_0"]]+[root/"extra/amdpci/headers/amdgpu_smu.h"], args=inc, tarball=am_src)
case _: raise AttributeError(f"no such autogen: {nm}")

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,89 @@
# mypy: ignore-errors
import ctypes
from tinygrad.helpers import unwrap
from tinygrad.runtime.support.c import Struct, CEnum, _IO, _IOW, _IOR, _IOWR
from ctypes.util import find_library
def dll():
try: return ctypes.CDLL(unwrap(find_library('nvJitLink')))
except: pass
return None
dll = dll()
nvJitLinkResult = CEnum(ctypes.c_uint32)
NVJITLINK_SUCCESS = nvJitLinkResult.define('NVJITLINK_SUCCESS', 0)
NVJITLINK_ERROR_UNRECOGNIZED_OPTION = nvJitLinkResult.define('NVJITLINK_ERROR_UNRECOGNIZED_OPTION', 1)
NVJITLINK_ERROR_MISSING_ARCH = nvJitLinkResult.define('NVJITLINK_ERROR_MISSING_ARCH', 2)
NVJITLINK_ERROR_INVALID_INPUT = nvJitLinkResult.define('NVJITLINK_ERROR_INVALID_INPUT', 3)
NVJITLINK_ERROR_PTX_COMPILE = nvJitLinkResult.define('NVJITLINK_ERROR_PTX_COMPILE', 4)
NVJITLINK_ERROR_NVVM_COMPILE = nvJitLinkResult.define('NVJITLINK_ERROR_NVVM_COMPILE', 5)
NVJITLINK_ERROR_INTERNAL = nvJitLinkResult.define('NVJITLINK_ERROR_INTERNAL', 6)
nvJitLinkInputType = CEnum(ctypes.c_uint32)
NVJITLINK_INPUT_NONE = nvJitLinkInputType.define('NVJITLINK_INPUT_NONE', 0)
NVJITLINK_INPUT_CUBIN = nvJitLinkInputType.define('NVJITLINK_INPUT_CUBIN', 1)
NVJITLINK_INPUT_PTX = nvJitLinkInputType.define('NVJITLINK_INPUT_PTX', 2)
NVJITLINK_INPUT_LTOIR = nvJitLinkInputType.define('NVJITLINK_INPUT_LTOIR', 3)
NVJITLINK_INPUT_FATBIN = nvJitLinkInputType.define('NVJITLINK_INPUT_FATBIN', 4)
NVJITLINK_INPUT_OBJECT = nvJitLinkInputType.define('NVJITLINK_INPUT_OBJECT', 5)
NVJITLINK_INPUT_LIBRARY = nvJitLinkInputType.define('NVJITLINK_INPUT_LIBRARY', 6)
class struct_nvJitLink(Struct): pass
nvJitLinkHandle = ctypes.POINTER(struct_nvJitLink)
uint32_t = ctypes.c_uint32
# nvJitLinkResult nvJitLinkCreate(nvJitLinkHandle *handle, uint32_t numOptions, const char **options)
try: (nvJitLinkCreate:=dll.nvJitLinkCreate).restype, nvJitLinkCreate.argtypes = nvJitLinkResult, [ctypes.POINTER(nvJitLinkHandle), uint32_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError: pass
# nvJitLinkResult nvJitLinkDestroy(nvJitLinkHandle *handle)
try: (nvJitLinkDestroy:=dll.nvJitLinkDestroy).restype, nvJitLinkDestroy.argtypes = nvJitLinkResult, [ctypes.POINTER(nvJitLinkHandle)]
except AttributeError: pass
size_t = ctypes.c_uint64
# nvJitLinkResult nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType, const void *data, size_t size, const char *name)
try: (nvJitLinkAddData:=dll.nvJitLinkAddData).restype, nvJitLinkAddData.argtypes = nvJitLinkResult, [nvJitLinkHandle, nvJitLinkInputType, ctypes.c_void_p, size_t, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvJitLinkResult nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char *fileName)
try: (nvJitLinkAddFile:=dll.nvJitLinkAddFile).restype, nvJitLinkAddFile.argtypes = nvJitLinkResult, [nvJitLinkHandle, nvJitLinkInputType, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvJitLinkResult nvJitLinkComplete(nvJitLinkHandle handle)
try: (nvJitLinkComplete:=dll.nvJitLinkComplete).restype, nvJitLinkComplete.argtypes = nvJitLinkResult, [nvJitLinkHandle]
except AttributeError: pass
# nvJitLinkResult nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t *size)
try: (nvJitLinkGetLinkedCubinSize:=dll.nvJitLinkGetLinkedCubinSize).restype, nvJitLinkGetLinkedCubinSize.argtypes = nvJitLinkResult, [nvJitLinkHandle, ctypes.POINTER(size_t)]
except AttributeError: pass
# nvJitLinkResult nvJitLinkGetLinkedCubin(nvJitLinkHandle handle, void *cubin)
try: (nvJitLinkGetLinkedCubin:=dll.nvJitLinkGetLinkedCubin).restype, nvJitLinkGetLinkedCubin.argtypes = nvJitLinkResult, [nvJitLinkHandle, ctypes.c_void_p]
except AttributeError: pass
# nvJitLinkResult nvJitLinkGetLinkedPtxSize(nvJitLinkHandle handle, size_t *size)
try: (nvJitLinkGetLinkedPtxSize:=dll.nvJitLinkGetLinkedPtxSize).restype, nvJitLinkGetLinkedPtxSize.argtypes = nvJitLinkResult, [nvJitLinkHandle, ctypes.POINTER(size_t)]
except AttributeError: pass
# nvJitLinkResult nvJitLinkGetLinkedPtx(nvJitLinkHandle handle, char *ptx)
try: (nvJitLinkGetLinkedPtx:=dll.nvJitLinkGetLinkedPtx).restype, nvJitLinkGetLinkedPtx.argtypes = nvJitLinkResult, [nvJitLinkHandle, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvJitLinkResult nvJitLinkGetErrorLogSize(nvJitLinkHandle handle, size_t *size)
try: (nvJitLinkGetErrorLogSize:=dll.nvJitLinkGetErrorLogSize).restype, nvJitLinkGetErrorLogSize.argtypes = nvJitLinkResult, [nvJitLinkHandle, ctypes.POINTER(size_t)]
except AttributeError: pass
# nvJitLinkResult nvJitLinkGetErrorLog(nvJitLinkHandle handle, char *log)
try: (nvJitLinkGetErrorLog:=dll.nvJitLinkGetErrorLog).restype, nvJitLinkGetErrorLog.argtypes = nvJitLinkResult, [nvJitLinkHandle, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvJitLinkResult nvJitLinkGetInfoLogSize(nvJitLinkHandle handle, size_t *size)
try: (nvJitLinkGetInfoLogSize:=dll.nvJitLinkGetInfoLogSize).restype, nvJitLinkGetInfoLogSize.argtypes = nvJitLinkResult, [nvJitLinkHandle, ctypes.POINTER(size_t)]
except AttributeError: pass
# nvJitLinkResult nvJitLinkGetInfoLog(nvJitLinkHandle handle, char *log)
try: (nvJitLinkGetInfoLog:=dll.nvJitLinkGetInfoLog).restype, nvJitLinkGetInfoLog.argtypes = nvJitLinkResult, [nvJitLinkHandle, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvJitLinkResult nvJitLinkVersion(unsigned int *major, unsigned int *minor)
try: (nvJitLinkVersion:=dll.nvJitLinkVersion).restype, nvJitLinkVersion.argtypes = nvJitLinkResult, [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32)]
except AttributeError: pass

View file

@ -1,581 +1,113 @@
# mypy: ignore-errors
# -*- coding: utf-8 -*-
#
# TARGET arch is: []
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes, ctypes.util
import ctypes
from tinygrad.helpers import unwrap
from tinygrad.runtime.support.c import Struct, CEnum, _IO, _IOW, _IOR, _IOWR
from ctypes.util import find_library
def dll():
try: return ctypes.CDLL(unwrap(find_library('nvrtc')))
except: pass
return None
dll = dll()
nvrtcResult = CEnum(ctypes.c_uint32)
NVRTC_SUCCESS = nvrtcResult.define('NVRTC_SUCCESS', 0)
NVRTC_ERROR_OUT_OF_MEMORY = nvrtcResult.define('NVRTC_ERROR_OUT_OF_MEMORY', 1)
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = nvrtcResult.define('NVRTC_ERROR_PROGRAM_CREATION_FAILURE', 2)
NVRTC_ERROR_INVALID_INPUT = nvrtcResult.define('NVRTC_ERROR_INVALID_INPUT', 3)
NVRTC_ERROR_INVALID_PROGRAM = nvrtcResult.define('NVRTC_ERROR_INVALID_PROGRAM', 4)
NVRTC_ERROR_INVALID_OPTION = nvrtcResult.define('NVRTC_ERROR_INVALID_OPTION', 5)
NVRTC_ERROR_COMPILATION = nvrtcResult.define('NVRTC_ERROR_COMPILATION', 6)
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = nvrtcResult.define('NVRTC_ERROR_BUILTIN_OPERATION_FAILURE', 7)
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = nvrtcResult.define('NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION', 8)
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = nvrtcResult.define('NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION', 9)
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = nvrtcResult.define('NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID', 10)
NVRTC_ERROR_INTERNAL_ERROR = nvrtcResult.define('NVRTC_ERROR_INTERNAL_ERROR', 11)
_libraries = {}
_libraries['libnvrtc.so'] = ctypes.CDLL(ctypes.util.find_library('nvrtc'))
def string_cast(char_pointer, encoding='utf-8', errors='strict'):
value = ctypes.cast(char_pointer, ctypes.c_char_p).value
if value is not None and encoding is not None:
value = value.decode(encoding, errors=errors)
return value
# const char *nvrtcGetErrorString(nvrtcResult result)
try: (nvrtcGetErrorString:=dll.nvrtcGetErrorString).restype, nvrtcGetErrorString.argtypes = ctypes.POINTER(ctypes.c_char), [nvrtcResult]
except AttributeError: pass
# nvrtcResult nvrtcVersion(int *major, int *minor)
try: (nvrtcVersion:=dll.nvrtcVersion).restype, nvrtcVersion.argtypes = nvrtcResult, [ctypes.POINTER(ctypes.c_int32), ctypes.POINTER(ctypes.c_int32)]
except AttributeError: pass
def char_pointer_cast(string, encoding='utf-8'):
    """Convert a Python string (or bytes) into a ``POINTER(c_char)``.

    When ``encoding`` is not ``None`` the argument is encoded first; objects
    without an ``encode`` attribute (such as ``bytes``) are passed through
    unchanged. The value is then wrapped in ``c_char_p`` and cast.
    """
    if encoding is not None:
        try:
            string = string.encode(encoding)
        except AttributeError:
            pass  # bytes objects have no encode(); use them as-is
    return ctypes.cast(ctypes.c_char_p(string), ctypes.POINTER(ctypes.c_char))
# nvrtcResult nvrtcGetNumSupportedArchs(int *numArchs)
try: (nvrtcGetNumSupportedArchs:=dll.nvrtcGetNumSupportedArchs).restype, nvrtcGetNumSupportedArchs.argtypes = nvrtcResult, [ctypes.POINTER(ctypes.c_int32)]
except AttributeError: pass
# nvrtcResult nvrtcGetSupportedArchs(int *supportedArchs)
try: (nvrtcGetSupportedArchs:=dll.nvrtcGetSupportedArchs).restype, nvrtcGetSupportedArchs.argtypes = nvrtcResult, [ctypes.POINTER(ctypes.c_int32)]
except AttributeError: pass
class AsDictMixin:
    """Mixin that converts a ctypes structure/union instance into a plain dict."""
    import sys
    # On Python 3.14+ explicitly select the 'ms' struct layout for every subclass.
    # NOTE(review): presumably needed because subclasses set _pack_ -- confirm.
    if sys.version_info >= (3, 14): _layout_ = 'ms'

    @classmethod
    def as_dict(cls, self):
        """Return the fields of ``self`` as a ``{field_name: value}`` dict.

        Args:
            self: the object to convert. Anything that is not an
                ``AsDictMixin`` instance is returned unchanged.

        Returns:
            A dict with one entry per entry of ``cls._fields_`` whose name does
            not start with ``PADDING_``. Arrays become lists, nested
            ``AsDictMixin`` values are converted recursively, and NULL
            pointers become ``None``. An empty dict is returned when ``cls``
            has no ``_fields_``.
        """
        if not isinstance(self, AsDictMixin):
            # not a structure, assume it's already a python object
            return self
        result = {}
        if not hasattr(cls, "_fields_"):
            return result
        for field_tuple in cls._fields_:  # noqa
            field = field_tuple[0]
            if field.startswith('PADDING_'):
                continue
            value = getattr(self, field)
            type_ = type(value)
            if hasattr(value, "_length_") and hasattr(value, "_type_"):
                # array
                if not hasattr(type_, "as_dict"):
                    value = [v for v in value]
                else:
                    type_ = type_._type_
                    value = [type_.as_dict(v) for v in value]
            elif hasattr(type_, "contents") and hasattr(value, "_type_"):
                # pointer -- probe the *type* for 'contents': reading it on a
                # NULL pointer instance raises ValueError, which hasattr() does
                # not swallow on Python 3, so probing the instance would crash
                # before the nullptr handler below is reached.
                try:
                    if not hasattr(type_, "as_dict"):
                        value = value.contents
                    else:
                        type_ = type_._type_
                        value = type_.as_dict(value.contents)
                except ValueError:
                    # nullptr
                    value = None
            elif isinstance(value, AsDictMixin):
                # other structure
                value = type_.as_dict(value)
            result[field] = value
        return result
class Structure(ctypes.Structure, AsDictMixin):
    """ctypes structure base with padding-aware construction and callback binding."""

    def __init__(self, *args, **kwds):
        """Initialise fields from positional and keyword arguments.

        Positional arguments are matched against the non-padding field names
        only, so they never fill PADDING* fields; keyword arguments override
        positional ones.
        """
        args = dict(zip(self.__class__._field_names_(), args))
        args.update(kwds)
        super().__init__(**args)

    @classmethod
    def _field_names_(cls):
        """Return an iterable of field names that do not start with 'PADDING'."""
        if hasattr(cls, '_fields_'):
            return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
        else:
            return ()

    @classmethod
    def get_type(cls, field):
        """Return the declared ctypes type of ``field``, or ``None`` if unknown."""
        for f in cls._fields_:
            if f[0] == field:
                return f[1]
        return None

    @classmethod
    def bind(cls, bound_fields):
        """Build an instance from ``bound_fields``, wrapping callback fields.

        Args:
            bound_fields: mapping of field name to value. For fields whose
                type exposes ``restype`` (function-pointer types) the value is
                a Python callable, or ``None`` for a default-constructed
                pointer. The mapping is not modified.

        Returns:
            A new ``cls`` instance. Callback fields missing from the mapping
            get a stub that ignores its arguments and returns the default
            value of the return type (``None`` if that cannot be built); other
            missing fields are default-constructed.

        Raises:
            ValueError: if ``bound_fields`` names something that is not a field.
        """
        # Work on a copy so the caller's mapping is left untouched.
        pending = dict(bound_fields)
        fields = {}
        for field in cls._fields_:
            # Index rather than unpack so 3-tuple (bit-field) entries are accepted.
            name, type_ = field[0], field[1]
            if hasattr(type_, "restype"):
                if name in pending:
                    callback = pending.pop(name)
                    if callback is None:
                        fields[name] = type_()
                    else:
                        # use a closure to capture the callback from the loop scope
                        fields[name] = (
                            type_((lambda callback: lambda *args: callback(*args))(
                                callback))
                        )
                else:
                    # default callback implementation (does nothing)
                    try:
                        default_ = type_(0).restype().value
                    except TypeError:
                        default_ = None
                    fields[name] = type_((
                        lambda default_: lambda *args: default_)(default_))
            else:
                # not a callback function, use default initialization
                if name in pending:
                    fields[name] = pending.pop(name)
                else:
                    fields[name] = type_()
        if len(pending) != 0:
            raise ValueError(
                "Cannot bind the following unknown callback(s) {}.{}".format(
                    cls.__name__, pending.keys()
            ))
        return cls(**fields)
class Union(ctypes.Union, AsDictMixin):
    """ctypes union that also offers ``as_dict`` through ``AsDictMixin``."""
_libraries['libnvJitLink.so'] = ctypes.CDLL(ctypes.util.find_library('nvJitLink'))
c_int128 = ctypes.c_ubyte*16
c_uint128 = c_int128
void = None
if ctypes.sizeof(ctypes.c_longdouble) == 16:
c_long_double_t = ctypes.c_longdouble
else:
c_long_double_t = ctypes.c_ubyte*16
# values for enumeration 'c__EA_nvrtcResult'
c__EA_nvrtcResult__enumvalues = {
0: 'NVRTC_SUCCESS',
1: 'NVRTC_ERROR_OUT_OF_MEMORY',
2: 'NVRTC_ERROR_PROGRAM_CREATION_FAILURE',
3: 'NVRTC_ERROR_INVALID_INPUT',
4: 'NVRTC_ERROR_INVALID_PROGRAM',
5: 'NVRTC_ERROR_INVALID_OPTION',
6: 'NVRTC_ERROR_COMPILATION',
7: 'NVRTC_ERROR_BUILTIN_OPERATION_FAILURE',
8: 'NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION',
9: 'NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION',
10: 'NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID',
11: 'NVRTC_ERROR_INTERNAL_ERROR',
12: 'NVRTC_ERROR_TIME_FILE_WRITE_FAILED',
}
NVRTC_SUCCESS = 0
NVRTC_ERROR_OUT_OF_MEMORY = 1
NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
NVRTC_ERROR_INVALID_INPUT = 3
NVRTC_ERROR_INVALID_PROGRAM = 4
NVRTC_ERROR_INVALID_OPTION = 5
NVRTC_ERROR_COMPILATION = 6
NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
NVRTC_ERROR_INTERNAL_ERROR = 11
NVRTC_ERROR_TIME_FILE_WRITE_FAILED = 12
c__EA_nvrtcResult = ctypes.c_uint32 # enum
nvrtcResult = c__EA_nvrtcResult
nvrtcResult__enumvalues = c__EA_nvrtcResult__enumvalues
try:
nvrtcGetErrorString = _libraries['libnvrtc.so'].nvrtcGetErrorString
nvrtcGetErrorString.restype = ctypes.POINTER(ctypes.c_char)
nvrtcGetErrorString.argtypes = [nvrtcResult]
except AttributeError:
pass
try:
nvrtcVersion = _libraries['libnvrtc.so'].nvrtcVersion
nvrtcVersion.restype = nvrtcResult
nvrtcVersion.argtypes = [ctypes.POINTER(ctypes.c_int32), ctypes.POINTER(ctypes.c_int32)]
except AttributeError:
pass
try:
nvrtcGetNumSupportedArchs = _libraries['libnvrtc.so'].nvrtcGetNumSupportedArchs
nvrtcGetNumSupportedArchs.restype = nvrtcResult
nvrtcGetNumSupportedArchs.argtypes = [ctypes.POINTER(ctypes.c_int32)]
except AttributeError:
pass
try:
nvrtcGetSupportedArchs = _libraries['libnvrtc.so'].nvrtcGetSupportedArchs
nvrtcGetSupportedArchs.restype = nvrtcResult
nvrtcGetSupportedArchs.argtypes = [ctypes.POINTER(ctypes.c_int32)]
except AttributeError:
pass
class struct__nvrtcProgram(Structure):
    """Structure type for ``_nvrtcProgram``; no fields are declared in the class body."""
class struct__nvrtcProgram(Struct):
  """Struct type for ``_nvrtcProgram``; no fields are declared in the class body."""
nvrtcProgram = ctypes.POINTER(struct__nvrtcProgram)
try:
nvrtcCreateProgram = _libraries['libnvrtc.so'].nvrtcCreateProgram
nvrtcCreateProgram.restype = nvrtcResult
nvrtcCreateProgram.argtypes = [ctypes.POINTER(ctypes.POINTER(struct__nvrtcProgram)), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.POINTER(ctypes.POINTER(ctypes.c_char)), ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError:
pass
try:
nvrtcDestroyProgram = _libraries['libnvrtc.so'].nvrtcDestroyProgram
nvrtcDestroyProgram.restype = nvrtcResult
nvrtcDestroyProgram.argtypes = [ctypes.POINTER(ctypes.POINTER(struct__nvrtcProgram))]
except AttributeError:
pass
try:
nvrtcCompileProgram = _libraries['libnvrtc.so'].nvrtcCompileProgram
nvrtcCompileProgram.restype = nvrtcResult
nvrtcCompileProgram.argtypes = [nvrtcProgram, ctypes.c_int32, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError:
pass
try:
nvrtcGetPTXSize = _libraries['libnvrtc.so'].nvrtcGetPTXSize
nvrtcGetPTXSize.restype = nvrtcResult
nvrtcGetPTXSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvrtcGetPTX = _libraries['libnvrtc.so'].nvrtcGetPTX
nvrtcGetPTX.restype = nvrtcResult
nvrtcGetPTX.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvrtcGetCUBINSize = _libraries['libnvrtc.so'].nvrtcGetCUBINSize
nvrtcGetCUBINSize.restype = nvrtcResult
nvrtcGetCUBINSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvrtcGetCUBIN = _libraries['libnvrtc.so'].nvrtcGetCUBIN
nvrtcGetCUBIN.restype = nvrtcResult
nvrtcGetCUBIN.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvrtcGetNVVMSize = _libraries['libnvrtc.so'].nvrtcGetNVVMSize
nvrtcGetNVVMSize.restype = nvrtcResult
nvrtcGetNVVMSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvrtcGetNVVM = _libraries['libnvrtc.so'].nvrtcGetNVVM
nvrtcGetNVVM.restype = nvrtcResult
nvrtcGetNVVM.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvrtcGetLTOIRSize = _libraries['libnvrtc.so'].nvrtcGetLTOIRSize
nvrtcGetLTOIRSize.restype = nvrtcResult
nvrtcGetLTOIRSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvrtcGetLTOIR = _libraries['libnvrtc.so'].nvrtcGetLTOIR
nvrtcGetLTOIR.restype = nvrtcResult
nvrtcGetLTOIR.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvrtcGetOptiXIRSize = _libraries['libnvrtc.so'].nvrtcGetOptiXIRSize
nvrtcGetOptiXIRSize.restype = nvrtcResult
nvrtcGetOptiXIRSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvrtcGetOptiXIR = _libraries['libnvrtc.so'].nvrtcGetOptiXIR
nvrtcGetOptiXIR.restype = nvrtcResult
nvrtcGetOptiXIR.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvrtcGetProgramLogSize = _libraries['libnvrtc.so'].nvrtcGetProgramLogSize
nvrtcGetProgramLogSize.restype = nvrtcResult
nvrtcGetProgramLogSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvrtcGetProgramLog = _libraries['libnvrtc.so'].nvrtcGetProgramLog
nvrtcGetProgramLog.restype = nvrtcResult
nvrtcGetProgramLog.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvrtcAddNameExpression = _libraries['libnvrtc.so'].nvrtcAddNameExpression
nvrtcAddNameExpression.restype = nvrtcResult
nvrtcAddNameExpression.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvrtcGetLoweredName = _libraries['libnvrtc.so'].nvrtcGetLoweredName
nvrtcGetLoweredName.restype = nvrtcResult
nvrtcGetLoweredName.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError:
pass
# nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog, const char *src, const char *name, int numHeaders, const char *const *headers, const char *const *includeNames)
try: (nvrtcCreateProgram:=dll.nvrtcCreateProgram).restype, nvrtcCreateProgram.argtypes = nvrtcResult, [ctypes.POINTER(nvrtcProgram), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.POINTER(ctypes.POINTER(ctypes.c_char)), ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError: pass
# values for enumeration 'c__EA_nvJitLinkResult'
c__EA_nvJitLinkResult__enumvalues = {
0: 'NVJITLINK_SUCCESS',
1: 'NVJITLINK_ERROR_UNRECOGNIZED_OPTION',
2: 'NVJITLINK_ERROR_MISSING_ARCH',
3: 'NVJITLINK_ERROR_INVALID_INPUT',
4: 'NVJITLINK_ERROR_PTX_COMPILE',
5: 'NVJITLINK_ERROR_NVVM_COMPILE',
6: 'NVJITLINK_ERROR_INTERNAL',
7: 'NVJITLINK_ERROR_THREADPOOL',
8: 'NVJITLINK_ERROR_UNRECOGNIZED_INPUT',
}
NVJITLINK_SUCCESS = 0
NVJITLINK_ERROR_UNRECOGNIZED_OPTION = 1
NVJITLINK_ERROR_MISSING_ARCH = 2
NVJITLINK_ERROR_INVALID_INPUT = 3
NVJITLINK_ERROR_PTX_COMPILE = 4
NVJITLINK_ERROR_NVVM_COMPILE = 5
NVJITLINK_ERROR_INTERNAL = 6
NVJITLINK_ERROR_THREADPOOL = 7
NVJITLINK_ERROR_UNRECOGNIZED_INPUT = 8
c__EA_nvJitLinkResult = ctypes.c_uint32 # enum
nvJitLinkResult = c__EA_nvJitLinkResult
nvJitLinkResult__enumvalues = c__EA_nvJitLinkResult__enumvalues
# nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog)
try: (nvrtcDestroyProgram:=dll.nvrtcDestroyProgram).restype, nvrtcDestroyProgram.argtypes = nvrtcResult, [ctypes.POINTER(nvrtcProgram)]
except AttributeError: pass
# values for enumeration 'c__EA_nvJitLinkInputType'
c__EA_nvJitLinkInputType__enumvalues = {
0: 'NVJITLINK_INPUT_NONE',
1: 'NVJITLINK_INPUT_CUBIN',
2: 'NVJITLINK_INPUT_PTX',
3: 'NVJITLINK_INPUT_LTOIR',
4: 'NVJITLINK_INPUT_FATBIN',
5: 'NVJITLINK_INPUT_OBJECT',
6: 'NVJITLINK_INPUT_LIBRARY',
10: 'NVJITLINK_INPUT_ANY',
}
NVJITLINK_INPUT_NONE = 0
NVJITLINK_INPUT_CUBIN = 1
NVJITLINK_INPUT_PTX = 2
NVJITLINK_INPUT_LTOIR = 3
NVJITLINK_INPUT_FATBIN = 4
NVJITLINK_INPUT_OBJECT = 5
NVJITLINK_INPUT_LIBRARY = 6
NVJITLINK_INPUT_ANY = 10
c__EA_nvJitLinkInputType = ctypes.c_uint32 # enum
nvJitLinkInputType = c__EA_nvJitLinkInputType
nvJitLinkInputType__enumvalues = c__EA_nvJitLinkInputType__enumvalues
class struct_nvJitLink(Structure):
    """Structure type for ``nvJitLink``; no fields are declared in the class body."""
# nvrtcResult nvrtcCompileProgram(nvrtcProgram prog, int numOptions, const char *const *options)
try: (nvrtcCompileProgram:=dll.nvrtcCompileProgram).restype, nvrtcCompileProgram.argtypes = nvrtcResult, [nvrtcProgram, ctypes.c_int32, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError: pass
nvJitLinkHandle = ctypes.POINTER(struct_nvJitLink)
uint32_t = ctypes.c_uint32
try:
__nvJitLinkCreate_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkCreate_12_4
__nvJitLinkCreate_12_4.restype = nvJitLinkResult
__nvJitLinkCreate_12_4.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_nvJitLink)), uint32_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError:
pass
try:
nvJitLinkCreate = _libraries['libnvJitLink.so'].nvJitLinkCreate
nvJitLinkCreate.restype = nvJitLinkResult
nvJitLinkCreate.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_nvJitLink)), uint32_t, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError:
pass
try:
__nvJitLinkDestroy_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkDestroy_12_4
__nvJitLinkDestroy_12_4.restype = nvJitLinkResult
__nvJitLinkDestroy_12_4.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_nvJitLink))]
except AttributeError:
pass
try:
nvJitLinkDestroy = _libraries['libnvJitLink.so'].nvJitLinkDestroy
nvJitLinkDestroy.restype = nvJitLinkResult
nvJitLinkDestroy.argtypes = [ctypes.POINTER(ctypes.POINTER(struct_nvJitLink))]
except AttributeError:
pass
size_t = ctypes.c_uint64
try:
__nvJitLinkAddData_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkAddData_12_4
__nvJitLinkAddData_12_4.restype = nvJitLinkResult
__nvJitLinkAddData_12_4.argtypes = [nvJitLinkHandle, nvJitLinkInputType, ctypes.POINTER(None), size_t, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvJitLinkAddData = _libraries['libnvJitLink.so'].nvJitLinkAddData
nvJitLinkAddData.restype = nvJitLinkResult
nvJitLinkAddData.argtypes = [nvJitLinkHandle, nvJitLinkInputType, ctypes.POINTER(None), size_t, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
__nvJitLinkAddFile_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkAddFile_12_4
__nvJitLinkAddFile_12_4.restype = nvJitLinkResult
__nvJitLinkAddFile_12_4.argtypes = [nvJitLinkHandle, nvJitLinkInputType, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvJitLinkAddFile = _libraries['libnvJitLink.so'].nvJitLinkAddFile
nvJitLinkAddFile.restype = nvJitLinkResult
nvJitLinkAddFile.argtypes = [nvJitLinkHandle, nvJitLinkInputType, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
__nvJitLinkComplete_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkComplete_12_4
__nvJitLinkComplete_12_4.restype = nvJitLinkResult
__nvJitLinkComplete_12_4.argtypes = [nvJitLinkHandle]
except AttributeError:
pass
try:
nvJitLinkComplete = _libraries['libnvJitLink.so'].nvJitLinkComplete
nvJitLinkComplete.restype = nvJitLinkResult
nvJitLinkComplete.argtypes = [nvJitLinkHandle]
except AttributeError:
pass
try:
__nvJitLinkGetLinkedCubinSize_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkGetLinkedCubinSize_12_4
__nvJitLinkGetLinkedCubinSize_12_4.restype = nvJitLinkResult
__nvJitLinkGetLinkedCubinSize_12_4.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvJitLinkGetLinkedCubinSize = _libraries['libnvJitLink.so'].nvJitLinkGetLinkedCubinSize
nvJitLinkGetLinkedCubinSize.restype = nvJitLinkResult
nvJitLinkGetLinkedCubinSize.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
__nvJitLinkGetLinkedCubin_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkGetLinkedCubin_12_4
__nvJitLinkGetLinkedCubin_12_4.restype = nvJitLinkResult
__nvJitLinkGetLinkedCubin_12_4.argtypes = [nvJitLinkHandle, ctypes.POINTER(None)]
except AttributeError:
pass
try:
nvJitLinkGetLinkedCubin = _libraries['libnvJitLink.so'].nvJitLinkGetLinkedCubin
nvJitLinkGetLinkedCubin.restype = nvJitLinkResult
nvJitLinkGetLinkedCubin.argtypes = [nvJitLinkHandle, ctypes.POINTER(None)]
except AttributeError:
pass
try:
__nvJitLinkGetLinkedPtxSize_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkGetLinkedPtxSize_12_4
__nvJitLinkGetLinkedPtxSize_12_4.restype = nvJitLinkResult
__nvJitLinkGetLinkedPtxSize_12_4.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvJitLinkGetLinkedPtxSize = _libraries['libnvJitLink.so'].nvJitLinkGetLinkedPtxSize
nvJitLinkGetLinkedPtxSize.restype = nvJitLinkResult
nvJitLinkGetLinkedPtxSize.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
__nvJitLinkGetLinkedPtx_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkGetLinkedPtx_12_4
__nvJitLinkGetLinkedPtx_12_4.restype = nvJitLinkResult
__nvJitLinkGetLinkedPtx_12_4.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvJitLinkGetLinkedPtx = _libraries['libnvJitLink.so'].nvJitLinkGetLinkedPtx
nvJitLinkGetLinkedPtx.restype = nvJitLinkResult
nvJitLinkGetLinkedPtx.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
__nvJitLinkGetErrorLogSize_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkGetErrorLogSize_12_4
__nvJitLinkGetErrorLogSize_12_4.restype = nvJitLinkResult
__nvJitLinkGetErrorLogSize_12_4.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvJitLinkGetErrorLogSize = _libraries['libnvJitLink.so'].nvJitLinkGetErrorLogSize
nvJitLinkGetErrorLogSize.restype = nvJitLinkResult
nvJitLinkGetErrorLogSize.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
__nvJitLinkGetErrorLog_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkGetErrorLog_12_4
__nvJitLinkGetErrorLog_12_4.restype = nvJitLinkResult
__nvJitLinkGetErrorLog_12_4.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvJitLinkGetErrorLog = _libraries['libnvJitLink.so'].nvJitLinkGetErrorLog
nvJitLinkGetErrorLog.restype = nvJitLinkResult
nvJitLinkGetErrorLog.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
__nvJitLinkGetInfoLogSize_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkGetInfoLogSize_12_4
__nvJitLinkGetInfoLogSize_12_4.restype = nvJitLinkResult
__nvJitLinkGetInfoLogSize_12_4.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
nvJitLinkGetInfoLogSize = _libraries['libnvJitLink.so'].nvJitLinkGetInfoLogSize
nvJitLinkGetInfoLogSize.restype = nvJitLinkResult
nvJitLinkGetInfoLogSize.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_uint64)]
except AttributeError:
pass
try:
__nvJitLinkGetInfoLog_12_4 = _libraries['libnvJitLink.so'].__nvJitLinkGetInfoLog_12_4
__nvJitLinkGetInfoLog_12_4.restype = nvJitLinkResult
__nvJitLinkGetInfoLog_12_4.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvJitLinkGetInfoLog = _libraries['libnvJitLink.so'].nvJitLinkGetInfoLog
nvJitLinkGetInfoLog.restype = nvJitLinkResult
nvJitLinkGetInfoLog.argtypes = [nvJitLinkHandle, ctypes.POINTER(ctypes.c_char)]
except AttributeError:
pass
try:
nvJitLinkVersion = _libraries['libnvJitLink.so'].nvJitLinkVersion
nvJitLinkVersion.restype = nvJitLinkResult
nvJitLinkVersion.argtypes = [ctypes.POINTER(ctypes.c_uint32), ctypes.POINTER(ctypes.c_uint32)]
except AttributeError:
pass
__all__ = \
['NVJITLINK_ERROR_INTERNAL', 'NVJITLINK_ERROR_INVALID_INPUT',
'NVJITLINK_ERROR_MISSING_ARCH', 'NVJITLINK_ERROR_NVVM_COMPILE',
'NVJITLINK_ERROR_PTX_COMPILE', 'NVJITLINK_ERROR_THREADPOOL',
'NVJITLINK_ERROR_UNRECOGNIZED_INPUT',
'NVJITLINK_ERROR_UNRECOGNIZED_OPTION', 'NVJITLINK_INPUT_ANY',
'NVJITLINK_INPUT_CUBIN', 'NVJITLINK_INPUT_FATBIN',
'NVJITLINK_INPUT_LIBRARY', 'NVJITLINK_INPUT_LTOIR',
'NVJITLINK_INPUT_NONE', 'NVJITLINK_INPUT_OBJECT',
'NVJITLINK_INPUT_PTX', 'NVJITLINK_SUCCESS',
'NVRTC_ERROR_BUILTIN_OPERATION_FAILURE',
'NVRTC_ERROR_COMPILATION', 'NVRTC_ERROR_INTERNAL_ERROR',
'NVRTC_ERROR_INVALID_INPUT', 'NVRTC_ERROR_INVALID_OPTION',
'NVRTC_ERROR_INVALID_PROGRAM',
'NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID',
'NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION',
'NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION',
'NVRTC_ERROR_OUT_OF_MEMORY',
'NVRTC_ERROR_PROGRAM_CREATION_FAILURE',
'NVRTC_ERROR_TIME_FILE_WRITE_FAILED', 'NVRTC_SUCCESS',
'__nvJitLinkAddData_12_4', '__nvJitLinkAddFile_12_4',
'__nvJitLinkComplete_12_4', '__nvJitLinkCreate_12_4',
'__nvJitLinkDestroy_12_4', '__nvJitLinkGetErrorLogSize_12_4',
'__nvJitLinkGetErrorLog_12_4', '__nvJitLinkGetInfoLogSize_12_4',
'__nvJitLinkGetInfoLog_12_4',
'__nvJitLinkGetLinkedCubinSize_12_4',
'__nvJitLinkGetLinkedCubin_12_4',
'__nvJitLinkGetLinkedPtxSize_12_4',
'__nvJitLinkGetLinkedPtx_12_4', 'c__EA_nvJitLinkInputType',
'c__EA_nvJitLinkResult', 'c__EA_nvrtcResult', 'nvJitLinkAddData',
'nvJitLinkAddFile', 'nvJitLinkComplete', 'nvJitLinkCreate',
'nvJitLinkDestroy', 'nvJitLinkGetErrorLog',
'nvJitLinkGetErrorLogSize', 'nvJitLinkGetInfoLog',
'nvJitLinkGetInfoLogSize', 'nvJitLinkGetLinkedCubin',
'nvJitLinkGetLinkedCubinSize', 'nvJitLinkGetLinkedPtx',
'nvJitLinkGetLinkedPtxSize', 'nvJitLinkHandle',
'nvJitLinkInputType', 'nvJitLinkInputType__enumvalues',
'nvJitLinkResult', 'nvJitLinkResult__enumvalues',
'nvJitLinkVersion', 'nvrtcAddNameExpression',
'nvrtcCompileProgram', 'nvrtcCreateProgram',
'nvrtcDestroyProgram', 'nvrtcGetCUBIN', 'nvrtcGetCUBINSize',
'nvrtcGetErrorString', 'nvrtcGetLTOIR', 'nvrtcGetLTOIRSize',
'nvrtcGetLoweredName', 'nvrtcGetNVVM', 'nvrtcGetNVVMSize',
'nvrtcGetNumSupportedArchs', 'nvrtcGetOptiXIR',
'nvrtcGetOptiXIRSize', 'nvrtcGetPTX', 'nvrtcGetPTXSize',
'nvrtcGetProgramLog', 'nvrtcGetProgramLogSize',
'nvrtcGetSupportedArchs', 'nvrtcProgram', 'nvrtcResult',
'nvrtcResult__enumvalues', 'nvrtcVersion', 'size_t',
'struct__nvrtcProgram', 'struct_nvJitLink', 'uint32_t']
# nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet)
try: (nvrtcGetPTXSize:=dll.nvrtcGetPTXSize).restype, nvrtcGetPTXSize.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(size_t)]
except AttributeError: pass
# nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx)
try: (nvrtcGetPTX:=dll.nvrtcGetPTX).restype, nvrtcGetPTX.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet)
try: (nvrtcGetCUBINSize:=dll.nvrtcGetCUBINSize).restype, nvrtcGetCUBINSize.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(size_t)]
except AttributeError: pass
# nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin)
try: (nvrtcGetCUBIN:=dll.nvrtcGetCUBIN).restype, nvrtcGetCUBIN.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# __attribute__((deprecated("This function will be removed in a future release. Please use nvrtcGetLTOIRSize instead"))) nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet)
try: (nvrtcGetNVVMSize:=dll.nvrtcGetNVVMSize).restype, nvrtcGetNVVMSize.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(size_t)]
except AttributeError: pass
# __attribute__((deprecated("This function will be removed in a future release. Please use nvrtcGetLTOIR instead"))) nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm)
try: (nvrtcGetNVVM:=dll.nvrtcGetNVVM).restype, nvrtcGetNVVM.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *LTOIRSizeRet)
try: (nvrtcGetLTOIRSize:=dll.nvrtcGetLTOIRSize).restype, nvrtcGetLTOIRSize.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(size_t)]
except AttributeError: pass
# nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *LTOIR)
try: (nvrtcGetLTOIR:=dll.nvrtcGetLTOIR).restype, nvrtcGetLTOIR.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvrtcResult nvrtcGetOptiXIRSize(nvrtcProgram prog, size_t *optixirSizeRet)
try: (nvrtcGetOptiXIRSize:=dll.nvrtcGetOptiXIRSize).restype, nvrtcGetOptiXIRSize.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(size_t)]
except AttributeError: pass
# nvrtcResult nvrtcGetOptiXIR(nvrtcProgram prog, char *optixir)
try: (nvrtcGetOptiXIR:=dll.nvrtcGetOptiXIR).restype, nvrtcGetOptiXIR.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet)
try: (nvrtcGetProgramLogSize:=dll.nvrtcGetProgramLogSize).restype, nvrtcGetProgramLogSize.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(size_t)]
except AttributeError: pass
# nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log)
try: (nvrtcGetProgramLog:=dll.nvrtcGetProgramLog).restype, nvrtcGetProgramLog.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog, const char *const name_expression)
try: (nvrtcAddNameExpression:=dll.nvrtcAddNameExpression).restype, nvrtcAddNameExpression.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
except AttributeError: pass
# nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog, const char *const name_expression, const char **lowered_name)
try: (nvrtcGetLoweredName:=dll.nvrtcGetLoweredName).restype, nvrtcGetLoweredName.argtypes = nvrtcResult, [nvrtcProgram, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
except AttributeError: pass
__DEPRECATED__ = lambda msg: __attribute__((deprecated(msg)))

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,669 +1,260 @@
# pylint: skip-file
# mypy: ignore-errors
# -*- coding: utf-8 -*-
#
# TARGET arch is: []
# WORD_SIZE is: 8
# POINTER_SIZE is: 8
# LONGDOUBLE_SIZE is: 16
#
import ctypes, ctypes.util
PATHS_TO_TRY = [
'/usr/local/lib/librocprof-trace-decoder.so',
'/usr/local/lib/librocprof-trace-decoder.dylib',
]
def _try_dlopen_rocprof_trace_decoder():
    """Load the rocprof trace decoder library, or return ``None`` if unavailable.

    The name reported by ``ctypes.util.find_library`` is tried first (when it
    is non-empty), then each entry of ``PATHS_TO_TRY`` in order. Only
    ``OSError`` from ``ctypes.CDLL`` is treated as "try the next candidate".
    """
    def _open(path):
        # Return the loaded handle, or None when this candidate cannot be opened.
        try:
            return ctypes.CDLL(path)
        except OSError:
            return None

    located = ctypes.util.find_library("rocprof-trace-decoder")
    handle = _open(located) if located else None
    if handle is not None:
        return handle
    for candidate in PATHS_TO_TRY:
        handle = _open(candidate)
        if handle is not None:
            return handle
    return None
import ctypes
from tinygrad.helpers import unwrap
from tinygrad.runtime.support.c import Struct, CEnum, _IO, _IOW, _IOR, _IOWR
from ctypes.util import find_library
def dll():
  """Try to load the rocprof trace decoder shared library.

  Candidates, in order: the library located by
  ``find_library('rocprof-trace-decoder')``, then the ``lib``-prefixed files
  under ``/usr/local/lib`` (the names the previous loader in this file
  searched), then the un-prefixed names kept for backward compatibility.

  Returns:
    The first ``ctypes.CDLL`` handle that loads, or ``None`` if none does.
  """
  # Best-effort load: catch Exception (not a bare except) so KeyboardInterrupt
  # and SystemExit still propagate.
  # NOTE(review): unwrap() is defined outside this chunk; presumably it rejects
  # the None that find_library() returns when nothing is found -- confirm.
  try: return ctypes.CDLL(unwrap(find_library('rocprof-trace-decoder')))
  except Exception: pass
  for path in ('/usr/local/lib/librocprof-trace-decoder.so',
               '/usr/local/lib/librocprof-trace-decoder.dylib',
               '/usr/local/lib/rocprof-trace-decoder.so',
               '/usr/local/lib/rocprof-trace-decoder.dylib'):
    try: return ctypes.CDLL(path)
    except OSError: pass  # not present / not loadable here; try the next candidate
  return None
dll = dll()
# rocprofiler_thread_trace_decoder_status_t: enum type built from CEnum over ctypes.c_uint32.
# Each define() call names one enumerator with its integer value and binds the result to a
# module-level constant of the same name.
# NOTE(review): the exact return value of define() is determined by CEnum in
# tinygrad.runtime.support.c, which is not visible here -- confirm there.
rocprofiler_thread_trace_decoder_status_t = CEnum(ctypes.c_uint32)
ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS = rocprofiler_thread_trace_decoder_status_t.define('ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS', 0)
ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR = rocprofiler_thread_trace_decoder_status_t.define('ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR', 1)
ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_OUT_OF_RESOURCES = rocprofiler_thread_trace_decoder_status_t.define('ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_OUT_OF_RESOURCES', 2)
ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_ARGUMENT = rocprofiler_thread_trace_decoder_status_t.define('ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_ARGUMENT', 3)
ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_SHADER_DATA = rocprofiler_thread_trace_decoder_status_t.define('ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_SHADER_DATA', 4)
ROCPROFILER_THREAD_TRACE_DECODER_STATUS_LAST = rocprofiler_thread_trace_decoder_status_t.define('ROCPROFILER_THREAD_TRACE_DECODER_STATUS_LAST', 5)
class AsDictMixin:
    """Mixin that converts a ctypes structure/union instance into a plain dict."""

    @classmethod
    def as_dict(cls, self):
        """Return the fields of ``self`` as a ``{field_name: value}`` dict.

        Args:
            self: the object to convert. Anything that is not an
                ``AsDictMixin`` instance is returned unchanged.

        Returns:
            A dict with one entry per entry of ``cls._fields_`` whose name does
            not start with ``PADDING_``. Arrays become lists, nested
            ``AsDictMixin`` values are converted recursively, and NULL
            pointers become ``None``. An empty dict is returned when ``cls``
            has no ``_fields_``.
        """
        if not isinstance(self, AsDictMixin):
            # not a structure, assume it's already a python object
            return self
        result = {}
        if not hasattr(cls, "_fields_"):
            return result
        for field_tuple in cls._fields_:  # noqa
            field = field_tuple[0]
            if field.startswith('PADDING_'):
                continue
            value = getattr(self, field)
            type_ = type(value)
            if hasattr(value, "_length_") and hasattr(value, "_type_"):
                # array
                if not hasattr(type_, "as_dict"):
                    value = [v for v in value]
                else:
                    type_ = type_._type_
                    value = [type_.as_dict(v) for v in value]
            elif hasattr(type_, "contents") and hasattr(value, "_type_"):
                # pointer -- the 'contents' probe is done on the type because
                # reading it on a NULL pointer instance raises ValueError, and
                # Python 3's hasattr() only suppresses AttributeError.
                try:
                    if not hasattr(type_, "as_dict"):
                        value = value.contents
                    else:
                        type_ = type_._type_
                        value = type_.as_dict(value.contents)
                except ValueError:
                    # nullptr
                    value = None
            elif isinstance(value, AsDictMixin):
                # other structure
                value = type_.as_dict(value)
            result[field] = value
        return result
# enum_rocprofiler_thread_trace_decoder_record_type_t: enum type built from CEnum over
# ctypes.c_uint32. Each define() call names one record-type enumerator with its integer
# value and binds the result to a module-level constant of the same name.
# NOTE(review): the exact return value of define() is determined by CEnum in
# tinygrad.runtime.support.c, which is not visible here -- confirm there.
enum_rocprofiler_thread_trace_decoder_record_type_t = CEnum(ctypes.c_uint32)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_GFXIP = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_GFXIP', 0)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_OCCUPANCY = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_OCCUPANCY', 1)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_PERFEVENT = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_PERFEVENT', 2)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE', 3)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_INFO = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_INFO', 4)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_DEBUG = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_DEBUG', 5)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_SHADERDATA = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_SHADERDATA', 6)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_REALTIME = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_REALTIME', 7)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_RT_FREQUENCY = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_RT_FREQUENCY', 8)
ROCPROFILER_THREAD_TRACE_DECODER_RECORD_LAST = enum_rocprofiler_thread_trace_decoder_record_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_RECORD_LAST', 9)
class Structure(ctypes.Structure, AsDictMixin):
    """ctypes structure base with padding-aware construction and callback binding."""

    def __init__(self, *args, **kwds):
        """Initialise fields from positional and keyword arguments.

        Positional arguments are paired with the non-padding field names only,
        so they never fill PADDING* fields; keyword arguments take precedence
        over positional ones.
        """
        args = dict(zip(self.__class__._field_names_(), args))
        args.update(kwds)
        super().__init__(**args)

    @classmethod
    def _field_names_(cls):
        """Return an iterable of field names that do not start with 'PADDING'."""
        if hasattr(cls, '_fields_'):
            return (f[0] for f in cls._fields_ if not f[0].startswith('PADDING'))
        else:
            return ()

    @classmethod
    def get_type(cls, field):
        """Return the declared ctypes type of ``field``, or ``None`` if unknown."""
        for f in cls._fields_:
            if f[0] == field:
                return f[1]
        return None

    @classmethod
    def bind(cls, bound_fields):
        """Build an instance from ``bound_fields``, wrapping callback fields.

        Args:
            bound_fields: mapping of field name to value. For fields whose
                type exposes ``restype`` (function-pointer types) the value is
                a Python callable, or ``None`` for a default-constructed
                pointer. The mapping is not modified.

        Returns:
            A new ``cls`` instance. Callback fields absent from the mapping
            receive a stub that ignores its arguments and returns the default
            value of the return type (``None`` if that cannot be built); other
            absent fields are default-constructed.

        Raises:
            ValueError: if ``bound_fields`` names something that is not a field.
        """
        # Consume a private copy so the caller's mapping is not mutated.
        pending = dict(bound_fields)
        fields = {}
        for field in cls._fields_:
            # Index rather than unpack so 3-tuple (bit-field) entries are accepted.
            name, type_ = field[0], field[1]
            if hasattr(type_, "restype"):
                if name in pending:
                    callback = pending.pop(name)
                    if callback is None:
                        fields[name] = type_()
                    else:
                        # use a closure to capture the callback from the loop scope
                        fields[name] = (
                            type_((lambda callback: lambda *args: callback(*args))(
                                callback))
                        )
                else:
                    # default callback implementation (does nothing)
                    try:
                        default_ = type_(0).restype().value
                    except TypeError:
                        default_ = None
                    fields[name] = type_((
                        lambda default_: lambda *args: default_)(default_))
            else:
                # not a callback function, use default initialization
                if name in pending:
                    fields[name] = pending.pop(name)
                else:
                    fields[name] = type_()
        if len(pending) != 0:
            raise ValueError(
                "Cannot bind the following unknown callback(s) {}.{}".format(
                    cls.__name__, pending.keys()
            ))
        return cls(**fields)
class Union(ctypes.Union, AsDictMixin):
    """ctypes union that also offers ``as_dict`` through ``AsDictMixin``."""
# 128-bit integers are carried as a raw 16-byte array; the unsigned alias
# shares the very same type object.
c_int128 = ctypes.c_ubyte * 16
c_uint128 = c_int128
void = None
# Use the native long double only when it occupies 16 bytes; otherwise fall
# back to an opaque 16-byte array.
c_long_double_t = (
    ctypes.c_longdouble
    if ctypes.sizeof(ctypes.c_longdouble) == 16
    else ctypes.c_ubyte * 16
)
def string_cast(char_pointer, encoding='utf-8', errors='strict'):
    """Read the NUL-terminated string that ``char_pointer`` points to.

    Returns ``None`` when the pointer yields no value, the raw ``bytes`` when
    ``encoding`` is ``None``, and otherwise the bytes decoded with
    ``encoding`` and the given ``errors`` policy.
    """
    raw = ctypes.cast(char_pointer, ctypes.c_char_p).value
    if raw is None or encoding is None:
        return raw
    return raw.decode(encoding, errors=errors)
def char_pointer_cast(string, encoding='utf-8'):
    """Return ``string`` as a ``POINTER(c_char)``.

    When ``encoding`` is not ``None`` the value is encoded first; objects
    without an ``encode`` attribute (such as ``bytes``) are passed through
    unchanged.
    """
    raw = string
    if encoding is not None:
        try:
            raw = string.encode(encoding)
        except AttributeError:
            # bytes has no encode attribute in Python 3: use it as it is
            pass
    return ctypes.cast(ctypes.c_char_p(raw), ctypes.POINTER(ctypes.c_char))
class FunctionFactoryStub:
    """Stand-in for a library object: any attribute lookup yields a function prototype."""

    def __getattr__(self, _):
        # The requested name is ignored; a new prototype is built on every lookup.
        prototype = ctypes.CFUNCTYPE(lambda y: y)
        return prototype
# _libraries['FIXME_STUB'] explanation:
# No library exporting this function was listed (-l libraryname.so),
# so a non-working stub is used instead.
# You can either re-run clang2py with -l /path/to/library.so
# or fix this manually by commenting out the ctypes.CDLL loading.
# Table of library handles; its single entry holds whatever
# _try_dlopen_rocprof_trace_decoder() returns.
_libraries = {
    'FIXME_STUB': _try_dlopen_rocprof_trace_decoder(),  # ctypes.CDLL('FIXME_STUB')
}
# Enumeration 'rocprofiler_thread_trace_decoder_info_t'.
# The table maps each numeric value to its enumerator name; names are listed
# in value order, so enumerate() reproduces the 0..4 numbering.
rocprofiler_thread_trace_decoder_info_t__enumvalues = dict(enumerate((
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_NONE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_DATA_LOST',
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_STITCH_INCOMPLETE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_WAVE_INCOMPLETE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_LAST',
)))
# One module-level integer constant per enumerator.
(
    ROCPROFILER_THREAD_TRACE_DECODER_INFO_NONE,
    ROCPROFILER_THREAD_TRACE_DECODER_INFO_DATA_LOST,
    ROCPROFILER_THREAD_TRACE_DECODER_INFO_STITCH_INCOMPLETE,
    ROCPROFILER_THREAD_TRACE_DECODER_INFO_WAVE_INCOMPLETE,
    ROCPROFILER_THREAD_TRACE_DECODER_INFO_LAST,
) = range(5)
# The enum type itself is represented by a 32-bit unsigned integer.
rocprofiler_thread_trace_decoder_info_t = ctypes.c_uint32  # enum
class struct_rocprofiler_thread_trace_decoder_pc_t(Structure):
    """Byte-packed structure with two unsigned 64-bit fields: ``address`` and ``code_object_id``."""

    _pack_ = 1  # source:False
    _fields_ = [
        ('address', ctypes.c_uint64),
        ('code_object_id', ctypes.c_uint64),
    ]


# Alias without the ``struct_`` prefix.
rocprofiler_thread_trace_decoder_pc_t = struct_rocprofiler_thread_trace_decoder_pc_t
class struct_rocprofiler_thread_trace_decoder_perfevent_t(Structure):
    """Byte-packed perf-event record.

    Holds a signed 64-bit ``time``, four unsigned 16-bit ``events0``..``events3``
    slots, one byte each for ``CU`` and ``bank``, and six explicit padding bytes.
    """

    _pack_ = 1  # source:False
    _fields_ = [
        ('time', ctypes.c_int64),
        ('events0', ctypes.c_uint16),
        ('events1', ctypes.c_uint16),
        ('events2', ctypes.c_uint16),
        ('events3', ctypes.c_uint16),
        ('CU', ctypes.c_ubyte),
        ('bank', ctypes.c_ubyte),
        ('PADDING_0', ctypes.c_ubyte * 6),
    ]


# Alias without the ``struct_`` prefix.
rocprofiler_thread_trace_decoder_perfevent_t = struct_rocprofiler_thread_trace_decoder_perfevent_t
class struct_rocprofiler_thread_trace_decoder_occupancy_t(Structure):
    """Byte-packed occupancy record.

    Starts with an embedded ``pc`` structure and an unsigned 64-bit ``time``,
    followed by four single-byte fields and a 32-bit word split into a 1-bit
    ``start`` flag and 31 reserved bits.
    """

    _pack_ = 1  # source:False
    _fields_ = [
        ('pc', rocprofiler_thread_trace_decoder_pc_t),
        ('time', ctypes.c_uint64),
        ('reserved', ctypes.c_ubyte),
        ('cu', ctypes.c_ubyte),
        ('simd', ctypes.c_ubyte),
        ('wave_id', ctypes.c_ubyte),
        ('start', ctypes.c_uint32, 1),
        ('_rsvd', ctypes.c_uint32, 31),
    ]


# Alias without the ``struct_`` prefix.
rocprofiler_thread_trace_decoder_occupancy_t = struct_rocprofiler_thread_trace_decoder_occupancy_t
# Enumeration 'rocprofiler_thread_trace_decoder_wstate_type_t'.
# Names are listed in value order, so enumerate() reproduces the 0..5 numbering.
rocprofiler_thread_trace_decoder_wstate_type_t__enumvalues = dict(enumerate((
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EMPTY',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_IDLE',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EXEC',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_WAIT',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_STALL',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_LAST',
)))
# One module-level integer constant per enumerator.
(
    ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EMPTY,
    ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_IDLE,
    ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EXEC,
    ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_WAIT,
    ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_STALL,
    ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_LAST,
) = range(6)
# The enum type itself is represented by a 32-bit unsigned integer.
rocprofiler_thread_trace_decoder_wstate_type_t = ctypes.c_uint32  # enum
class struct_rocprofiler_thread_trace_decoder_wave_state_t(Structure):
    """Byte-packed structure with two signed 32-bit fields: ``type`` and ``duration``."""

    _pack_ = 1  # source:False
    _fields_ = [
        ('type', ctypes.c_int32),
        ('duration', ctypes.c_int32),
    ]


# Alias without the ``struct_`` prefix.
rocprofiler_thread_trace_decoder_wave_state_t = struct_rocprofiler_thread_trace_decoder_wave_state_t
# Enumeration 'rocprofiler_thread_trace_decoder_inst_category_t'.
# Names are listed in value order, so enumerate() reproduces the 0..13 numbering.
rocprofiler_thread_trace_decoder_inst_category_t__enumvalues = dict(enumerate((
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_NONE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_SMEM',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_SALU',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_VMEM',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_FLAT',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_LDS',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_VALU',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_JUMP',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_NEXT',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_IMMED',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_CONTEXT',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_MESSAGE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_BVH',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_LAST',
)))
# One module-level integer constant per enumerator.
(
    ROCPROFILER_THREAD_TRACE_DECODER_INST_NONE,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_SMEM,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_SALU,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_VMEM,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_FLAT,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_LDS,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_VALU,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_JUMP,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_NEXT,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_IMMED,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_CONTEXT,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_MESSAGE,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_BVH,
    ROCPROFILER_THREAD_TRACE_DECODER_INST_LAST,
) = range(14)
# The enum type itself is represented by a 32-bit unsigned integer.
rocprofiler_thread_trace_decoder_inst_category_t = ctypes.c_uint32  # enum
class struct_rocprofiler_thread_trace_decoder_inst_t(Structure):
    """Byte-packed instruction record.

    The first 32-bit word is split into an 8-bit ``category`` and a 24-bit
    ``stall``; it is followed by a signed 32-bit ``duration``, a signed
    64-bit ``time`` and an embedded ``pc`` structure.
    """

    _pack_ = 1  # source:False
    _fields_ = [
        ('category', ctypes.c_uint32, 8),
        ('stall', ctypes.c_uint32, 24),
        ('duration', ctypes.c_int32),
        ('time', ctypes.c_int64),
        ('pc', rocprofiler_thread_trace_decoder_pc_t),
    ]


# Alias without the ``struct_`` prefix.
rocprofiler_thread_trace_decoder_inst_t = struct_rocprofiler_thread_trace_decoder_inst_t
class struct_rocprofiler_thread_trace_decoder_wave_t(Structure):
    """Byte-packed wave record.

    Four single-byte fields and three reserved 32-bit words come first, then
    the signed 64-bit ``begin_time``/``end_time`` pair, the unsigned 64-bit
    ``timeline_size``/``instructions_size`` pair, and pointers to wave-state
    and instruction structures.
    """

    _pack_ = 1  # source:False
    _fields_ = [
        ('cu', ctypes.c_ubyte),
        ('simd', ctypes.c_ubyte),
        ('wave_id', ctypes.c_ubyte),
        ('contexts', ctypes.c_ubyte),
        ('_rsvd1', ctypes.c_uint32),
        ('_rsvd2', ctypes.c_uint32),
        ('_rsvd3', ctypes.c_uint32),
        ('begin_time', ctypes.c_int64),
        ('end_time', ctypes.c_int64),
        ('timeline_size', ctypes.c_uint64),
        ('instructions_size', ctypes.c_uint64),
        ('timeline_array', ctypes.POINTER(struct_rocprofiler_thread_trace_decoder_wave_state_t)),
        ('instructions_array', ctypes.POINTER(struct_rocprofiler_thread_trace_decoder_inst_t)),
    ]


# Alias without the ``struct_`` prefix.
rocprofiler_thread_trace_decoder_wave_t = struct_rocprofiler_thread_trace_decoder_wave_t
class struct_rocprofiler_thread_trace_decoder_realtime_t(Structure):
    """Byte-packed structure: signed 64-bit ``shader_clock``, unsigned 64-bit ``realtime_clock`` and ``reserved``."""

    _pack_ = 1  # source:False
    _fields_ = [
        ('shader_clock', ctypes.c_int64),
        ('realtime_clock', ctypes.c_uint64),
        ('reserved', ctypes.c_uint64),
    ]


# Alias without the ``struct_`` prefix.
rocprofiler_thread_trace_decoder_realtime_t = struct_rocprofiler_thread_trace_decoder_realtime_t
# Enumeration 'rocprofiler_thread_trace_decoder_shaderdata_flags_t'.
# Names are listed in value order, so enumerate() reproduces the 0..1 numbering.
rocprofiler_thread_trace_decoder_shaderdata_flags_t__enumvalues = dict(enumerate((
    'ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_IMM',
    'ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_PRIV',
)))
# One module-level integer constant per enumerator.
(
    ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_IMM,
    ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_PRIV,
) = range(2)
# The enum type itself is represented by a 32-bit unsigned integer.
rocprofiler_thread_trace_decoder_shaderdata_flags_t = ctypes.c_uint32  # enum
class struct_rocprofiler_thread_trace_decoder_shaderdata_t(Structure):
    """Byte-packed shader-data record.

    Holds a signed 64-bit ``time``, an unsigned 64-bit ``value``, four
    single-byte fields (``cu``, ``simd``, ``wave_id``, ``flags``) and an
    unsigned 32-bit ``reserved`` word.
    """

    _pack_ = 1  # source:False
    _fields_ = [
        ('time', ctypes.c_int64),
        ('value', ctypes.c_uint64),
        ('cu', ctypes.c_ubyte),
        ('simd', ctypes.c_ubyte),
        ('wave_id', ctypes.c_ubyte),
        ('flags', ctypes.c_ubyte),
        ('reserved', ctypes.c_uint32),
    ]


# Alias without the ``struct_`` prefix.
rocprofiler_thread_trace_decoder_shaderdata_t = struct_rocprofiler_thread_trace_decoder_shaderdata_t
# Enumeration 'rocprofiler_thread_trace_decoder_record_type_t'.
# Names are listed in value order, so enumerate() reproduces the 0..9 numbering.
rocprofiler_thread_trace_decoder_record_type_t__enumvalues = dict(enumerate((
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_GFXIP',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_OCCUPANCY',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_PERFEVENT',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_INFO',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_DEBUG',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_SHADERDATA',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_REALTIME',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_RT_FREQUENCY',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_LAST',
)))
# One module-level integer constant per enumerator.
(
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_GFXIP,
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_OCCUPANCY,
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_PERFEVENT,
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE,
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_INFO,
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_DEBUG,
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_SHADERDATA,
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_REALTIME,
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_RT_FREQUENCY,
    ROCPROFILER_THREAD_TRACE_DECODER_RECORD_LAST,
) = range(10)
# The enum type itself is represented by a 32-bit unsigned integer.
rocprofiler_thread_trace_decoder_record_type_t = ctypes.c_uint32  # enum
# Enumeration 'c__EA_rocprofiler_thread_trace_decoder_status_t'.
# Names are listed in value order, so enumerate() reproduces the 0..5 numbering.
c__EA_rocprofiler_thread_trace_decoder_status_t__enumvalues = dict(enumerate((
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_OUT_OF_RESOURCES',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_ARGUMENT',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_SHADER_DATA',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_LAST',
)))
# One module-level integer constant per enumerator.
(
    ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS,
    ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR,
    ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_OUT_OF_RESOURCES,
    ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_ARGUMENT,
    ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_SHADER_DATA,
    ROCPROFILER_THREAD_TRACE_DECODER_STATUS_LAST,
) = range(6)
# The enum type itself is represented by a 32-bit unsigned integer.
c__EA_rocprofiler_thread_trace_decoder_status_t = ctypes.c_uint32  # enum
# The typedef name shares both the type and the value table of the anonymous enum.
rocprofiler_thread_trace_decoder_status_t = c__EA_rocprofiler_thread_trace_decoder_status_t
rocprofiler_thread_trace_decoder_status_t__enumvalues = c__EA_rocprofiler_thread_trace_decoder_status_t__enumvalues
# Callback prototypes. In each CFUNCTYPE call the first argument is the
# result type and the remaining ones are the parameter types, in order.

# Trace callback: status result; record type, untyped pointer, unsigned
# 64-bit integer, untyped pointer.
rocprof_trace_decoder_trace_callback_t = ctypes.CFUNCTYPE(
    c__EA_rocprofiler_thread_trace_decoder_status_t,
    rocprofiler_thread_trace_decoder_record_type_t,
    ctypes.POINTER(None),
    ctypes.c_uint64,
    ctypes.POINTER(None),
)
# ISA callback: status result; char pointer, two pointers to unsigned 64-bit
# integers, a pc structure passed by value, untyped pointer.
rocprof_trace_decoder_isa_callback_t = ctypes.CFUNCTYPE(
    c__EA_rocprofiler_thread_trace_decoder_status_t,
    ctypes.POINTER(ctypes.c_char),
    ctypes.POINTER(ctypes.c_uint64),
    ctypes.POINTER(ctypes.c_uint64),
    struct_rocprofiler_thread_trace_decoder_pc_t,
    ctypes.POINTER(None),
)
# SE-data callback: unsigned 64-bit result; pointer to a byte pointer,
# pointer to an unsigned 64-bit integer, untyped pointer.
rocprof_trace_decoder_se_data_callback_t = ctypes.CFUNCTYPE(
    ctypes.c_uint64,
    ctypes.POINTER(ctypes.POINTER(ctypes.c_ubyte)),
    ctypes.POINTER(ctypes.c_uint64),
    ctypes.POINTER(None),
)
# Look up rocprof_trace_decoder_parse_data in the library table and declare
# its C signature. An AttributeError (for instance a missing symbol) is
# ignored, in which case this block may leave the name unbound.
try:
    rocprof_trace_decoder_parse_data = _libraries['FIXME_STUB'].rocprof_trace_decoder_parse_data
    rocprof_trace_decoder_parse_data.restype = rocprofiler_thread_trace_decoder_status_t
    rocprof_trace_decoder_parse_data.argtypes = [
        rocprof_trace_decoder_se_data_callback_t,
        rocprof_trace_decoder_trace_callback_t,
        rocprof_trace_decoder_isa_callback_t,
        ctypes.POINTER(None),
    ]
except AttributeError:
    pass
# Look up rocprof_trace_decoder_get_info_string in the library table and
# declare its C signature: one info-enum argument, char-pointer result.
# An AttributeError (for instance a missing symbol) is ignored.
try:
    rocprof_trace_decoder_get_info_string = _libraries['FIXME_STUB'].rocprof_trace_decoder_get_info_string
    rocprof_trace_decoder_get_info_string.restype = ctypes.POINTER(ctypes.c_char)
    rocprof_trace_decoder_get_info_string.argtypes = [
        rocprofiler_thread_trace_decoder_info_t,
    ]
except AttributeError:
    pass
# Look up rocprof_trace_decoder_get_status_string in the library table and
# declare its C signature: one status-enum argument, char-pointer result.
# An AttributeError (for instance a missing symbol) is ignored.
try:
    rocprof_trace_decoder_get_status_string = _libraries['FIXME_STUB'].rocprof_trace_decoder_get_status_string
    rocprof_trace_decoder_get_status_string.restype = ctypes.POINTER(ctypes.c_char)
    rocprof_trace_decoder_get_status_string.argtypes = [
        rocprofiler_thread_trace_decoder_status_t,
    ]
except AttributeError:
    pass
# Debug callback prototype: no result; receives a signed 64-bit integer,
# two char pointers and an untyped pointer, in that order.
rocprofiler_thread_trace_decoder_debug_callback_t = ctypes.CFUNCTYPE(
    None,
    ctypes.c_int64,
    ctypes.POINTER(ctypes.c_char),
    ctypes.POINTER(ctypes.c_char),
    ctypes.POINTER(None),
)
rocprof_trace_decoder_trace_callback_t = ctypes.CFUNCTYPE(rocprofiler_thread_trace_decoder_status_t, enum_rocprofiler_thread_trace_decoder_record_type_t, ctypes.c_void_p, ctypes.c_uint64, ctypes.c_void_p)
class struct_rocprofiler_thread_trace_decoder_pc_t(Struct): pass
uint64_t = ctypes.c_uint64
try:
rocprof_trace_decoder_dump_data = _libraries['FIXME_STUB'].rocprof_trace_decoder_dump_data
rocprof_trace_decoder_dump_data.restype = rocprofiler_thread_trace_decoder_status_t
rocprof_trace_decoder_dump_data.argtypes = [ctypes.POINTER(ctypes.c_char), uint64_t, rocprofiler_thread_trace_decoder_debug_callback_t, ctypes.POINTER(None)]
except AttributeError:
pass
class union_rocprof_trace_decoder_gfx9_header_t(Union):
pass
class struct_rocprof_trace_decoder_gfx9_header_t_0(Structure):
pass
struct_rocprof_trace_decoder_gfx9_header_t_0._pack_ = 1 # source:False
struct_rocprof_trace_decoder_gfx9_header_t_0._fields_ = [
('legacy_version', ctypes.c_uint64, 13),
('gfx9_version2', ctypes.c_uint64, 3),
('DSIMDM', ctypes.c_uint64, 4),
('DCU', ctypes.c_uint64, 5),
('reserved1', ctypes.c_uint64, 1),
('SEID', ctypes.c_uint64, 6),
('reserved2', ctypes.c_uint64, 32),
struct_rocprofiler_thread_trace_decoder_pc_t._fields_ = [
('address', uint64_t),
('code_object_id', uint64_t),
]
rocprof_trace_decoder_isa_callback_t = ctypes.CFUNCTYPE(rocprofiler_thread_trace_decoder_status_t, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_uint64), ctypes.POINTER(ctypes.c_uint64), struct_rocprofiler_thread_trace_decoder_pc_t, ctypes.c_void_p)
rocprof_trace_decoder_se_data_callback_t = ctypes.CFUNCTYPE(ctypes.c_uint64, ctypes.POINTER(ctypes.POINTER(ctypes.c_ubyte)), ctypes.POINTER(ctypes.c_uint64), ctypes.c_void_p)
# rocprofiler_thread_trace_decoder_status_t rocprof_trace_decoder_parse_data(rocprof_trace_decoder_se_data_callback_t se_data_callback, rocprof_trace_decoder_trace_callback_t trace_callback, rocprof_trace_decoder_isa_callback_t isa_callback, void *userdata)
try: (rocprof_trace_decoder_parse_data:=dll.rocprof_trace_decoder_parse_data).restype, rocprof_trace_decoder_parse_data.argtypes = rocprofiler_thread_trace_decoder_status_t, [rocprof_trace_decoder_se_data_callback_t, rocprof_trace_decoder_trace_callback_t, rocprof_trace_decoder_isa_callback_t, ctypes.c_void_p]
except AttributeError: pass
union_rocprof_trace_decoder_gfx9_header_t._pack_ = 1 # source:False
union_rocprof_trace_decoder_gfx9_header_t._anonymous_ = ('_0',)
enum_rocprofiler_thread_trace_decoder_info_t = CEnum(ctypes.c_uint32)
ROCPROFILER_THREAD_TRACE_DECODER_INFO_NONE = enum_rocprofiler_thread_trace_decoder_info_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INFO_NONE', 0)
ROCPROFILER_THREAD_TRACE_DECODER_INFO_DATA_LOST = enum_rocprofiler_thread_trace_decoder_info_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INFO_DATA_LOST', 1)
ROCPROFILER_THREAD_TRACE_DECODER_INFO_STITCH_INCOMPLETE = enum_rocprofiler_thread_trace_decoder_info_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INFO_STITCH_INCOMPLETE', 2)
ROCPROFILER_THREAD_TRACE_DECODER_INFO_WAVE_INCOMPLETE = enum_rocprofiler_thread_trace_decoder_info_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INFO_WAVE_INCOMPLETE', 3)
ROCPROFILER_THREAD_TRACE_DECODER_INFO_LAST = enum_rocprofiler_thread_trace_decoder_info_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INFO_LAST', 4)
rocprofiler_thread_trace_decoder_info_t = enum_rocprofiler_thread_trace_decoder_info_t
# const char *rocprof_trace_decoder_get_info_string(rocprofiler_thread_trace_decoder_info_t info)
try: (rocprof_trace_decoder_get_info_string:=dll.rocprof_trace_decoder_get_info_string).restype, rocprof_trace_decoder_get_info_string.argtypes = ctypes.POINTER(ctypes.c_char), [rocprofiler_thread_trace_decoder_info_t]
except AttributeError: pass
# const char *rocprof_trace_decoder_get_status_string(rocprofiler_thread_trace_decoder_status_t status)
try: (rocprof_trace_decoder_get_status_string:=dll.rocprof_trace_decoder_get_status_string).restype, rocprof_trace_decoder_get_status_string.argtypes = ctypes.POINTER(ctypes.c_char), [rocprofiler_thread_trace_decoder_status_t]
except AttributeError: pass
rocprofiler_thread_trace_decoder_debug_callback_t = ctypes.CFUNCTYPE(None, ctypes.c_int64, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.c_void_p)
# rocprofiler_thread_trace_decoder_status_t rocprof_trace_decoder_dump_data(const char *data, uint64_t data_size, rocprofiler_thread_trace_decoder_debug_callback_t cb, void *userdata)
try: (rocprof_trace_decoder_dump_data:=dll.rocprof_trace_decoder_dump_data).restype, rocprof_trace_decoder_dump_data.argtypes = rocprofiler_thread_trace_decoder_status_t, [ctypes.POINTER(ctypes.c_char), uint64_t, rocprofiler_thread_trace_decoder_debug_callback_t, ctypes.c_void_p]
except AttributeError: pass
class union_rocprof_trace_decoder_gfx9_header_t(ctypes.Union): pass
class union_rocprof_trace_decoder_gfx9_header_t_0(Struct): pass
union_rocprof_trace_decoder_gfx9_header_t_0._fields_ = [
('legacy_version', uint64_t,13),
('gfx9_version2', uint64_t,3),
('DSIMDM', uint64_t,4),
('DCU', uint64_t,5),
('reserved1', uint64_t,1),
('SEID', uint64_t,6),
('reserved2', uint64_t,32),
]
union_rocprof_trace_decoder_gfx9_header_t._anonymous_ = ['_0']
union_rocprof_trace_decoder_gfx9_header_t._fields_ = [
('_0', struct_rocprof_trace_decoder_gfx9_header_t_0),
('raw', ctypes.c_uint64),
('_0', union_rocprof_trace_decoder_gfx9_header_t_0),
('raw', uint64_t),
]
rocprof_trace_decoder_gfx9_header_t = union_rocprof_trace_decoder_gfx9_header_t
class union_rocprof_trace_decoder_instrument_enable_t(Union):
pass
class struct_rocprof_trace_decoder_instrument_enable_t_0(Structure):
pass
struct_rocprof_trace_decoder_instrument_enable_t_0._pack_ = 1 # source:False
struct_rocprof_trace_decoder_instrument_enable_t_0._fields_ = [
('char1', ctypes.c_uint32, 8),
('char2', ctypes.c_uint32, 8),
('char3', ctypes.c_uint32, 8),
('char4', ctypes.c_uint32, 8),
class union_rocprof_trace_decoder_instrument_enable_t(ctypes.Union): pass
class union_rocprof_trace_decoder_instrument_enable_t_0(Struct): pass
union_rocprof_trace_decoder_instrument_enable_t_0._fields_ = [
('char1', ctypes.c_uint32,8),
('char2', ctypes.c_uint32,8),
('char3', ctypes.c_uint32,8),
('char4', ctypes.c_uint32,8),
]
union_rocprof_trace_decoder_instrument_enable_t._pack_ = 1 # source:False
union_rocprof_trace_decoder_instrument_enable_t._anonymous_ = ('_0',)
union_rocprof_trace_decoder_instrument_enable_t._anonymous_ = ['_0']
union_rocprof_trace_decoder_instrument_enable_t._fields_ = [
('_0', struct_rocprof_trace_decoder_instrument_enable_t_0),
('u32All', ctypes.c_uint32),
('_0', union_rocprof_trace_decoder_instrument_enable_t_0),
('u32All', ctypes.c_uint32),
]
rocprof_trace_decoder_instrument_enable_t = union_rocprof_trace_decoder_instrument_enable_t
class union_rocprof_trace_decoder_packet_header_t(Union):
pass
class struct_rocprof_trace_decoder_packet_header_t_0(Structure):
pass
struct_rocprof_trace_decoder_packet_header_t_0._pack_ = 1 # source:False
struct_rocprof_trace_decoder_packet_header_t_0._fields_ = [
('opcode', ctypes.c_uint32, 8),
('type', ctypes.c_uint32, 4),
('data20', ctypes.c_uint32, 20),
class union_rocprof_trace_decoder_packet_header_t(ctypes.Union): pass
class union_rocprof_trace_decoder_packet_header_t_0(Struct): pass
union_rocprof_trace_decoder_packet_header_t_0._fields_ = [
('opcode', ctypes.c_uint32,8),
('type', ctypes.c_uint32,4),
('data20', ctypes.c_uint32,20),
]
union_rocprof_trace_decoder_packet_header_t._pack_ = 1 # source:False
union_rocprof_trace_decoder_packet_header_t._anonymous_ = ('_0',)
union_rocprof_trace_decoder_packet_header_t._anonymous_ = ['_0']
union_rocprof_trace_decoder_packet_header_t._fields_ = [
('_0', struct_rocprof_trace_decoder_packet_header_t_0),
('u32All', ctypes.c_uint32),
('_0', union_rocprof_trace_decoder_packet_header_t_0),
('u32All', ctypes.c_uint32),
]
rocprof_trace_decoder_packet_header_t = union_rocprof_trace_decoder_packet_header_t
enum_rocprof_trace_decoder_packet_opcode_t = CEnum(ctypes.c_uint32)
ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ = enum_rocprof_trace_decoder_packet_opcode_t.define('ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ', 4)
ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP = enum_rocprof_trace_decoder_packet_opcode_t.define('ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP', 5)
ROCPROF_TRACE_DECODER_PACKET_OPCODE_AGENT_INFO = enum_rocprof_trace_decoder_packet_opcode_t.define('ROCPROF_TRACE_DECODER_PACKET_OPCODE_AGENT_INFO', 6)
# values for enumeration 'rocprof_trace_decoder_packet_opcode_t'
rocprof_trace_decoder_packet_opcode_t__enumvalues = {
4: 'ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ',
5: 'ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP',
6: 'ROCPROF_TRACE_DECODER_PACKET_OPCODE_AGENT_INFO',
}
ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ = 4
ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP = 5
ROCPROF_TRACE_DECODER_PACKET_OPCODE_AGENT_INFO = 6
rocprof_trace_decoder_packet_opcode_t = ctypes.c_uint32 # enum
rocprof_trace_decoder_packet_opcode_t = enum_rocprof_trace_decoder_packet_opcode_t
enum_rocprof_trace_decoder_agent_info_type_t = CEnum(ctypes.c_uint32)
ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ = enum_rocprof_trace_decoder_agent_info_type_t.define('ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ', 0)
ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_COUNTER_INTERVAL = enum_rocprof_trace_decoder_agent_info_type_t.define('ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_COUNTER_INTERVAL', 1)
ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_LAST = enum_rocprof_trace_decoder_agent_info_type_t.define('ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_LAST', 2)
# values for enumeration 'rocprof_trace_decoder_agent_info_type_t'
rocprof_trace_decoder_agent_info_type_t__enumvalues = {
0: 'ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ',
1: 'ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_COUNTER_INTERVAL',
2: 'ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_LAST',
}
ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ = 0
ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_COUNTER_INTERVAL = 1
ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_LAST = 2
rocprof_trace_decoder_agent_info_type_t = ctypes.c_uint32 # enum
class union_rocprof_trace_decoder_codeobj_marker_tail_t(Union):
pass
class struct_rocprof_trace_decoder_codeobj_marker_tail_t_0(Structure):
pass
struct_rocprof_trace_decoder_codeobj_marker_tail_t_0._pack_ = 1 # source:False
struct_rocprof_trace_decoder_codeobj_marker_tail_t_0._fields_ = [
('isUnload', ctypes.c_uint32, 1),
('bFromStart', ctypes.c_uint32, 1),
('legacy_id', ctypes.c_uint32, 30),
rocprof_trace_decoder_agent_info_type_t = enum_rocprof_trace_decoder_agent_info_type_t
class union_rocprof_trace_decoder_codeobj_marker_tail_t(ctypes.Union): pass
class union_rocprof_trace_decoder_codeobj_marker_tail_t_0(Struct): pass
uint32_t = ctypes.c_uint32
union_rocprof_trace_decoder_codeobj_marker_tail_t_0._fields_ = [
('isUnload', uint32_t,1),
('bFromStart', uint32_t,1),
('legacy_id', uint32_t,30),
]
union_rocprof_trace_decoder_codeobj_marker_tail_t._pack_ = 1 # source:False
union_rocprof_trace_decoder_codeobj_marker_tail_t._anonymous_ = ('_0',)
union_rocprof_trace_decoder_codeobj_marker_tail_t._anonymous_ = ['_0']
union_rocprof_trace_decoder_codeobj_marker_tail_t._fields_ = [
('_0', struct_rocprof_trace_decoder_codeobj_marker_tail_t_0),
('raw', ctypes.c_uint32),
('_0', union_rocprof_trace_decoder_codeobj_marker_tail_t_0),
('raw', uint32_t),
]
rocprof_trace_decoder_codeobj_marker_tail_t = union_rocprof_trace_decoder_codeobj_marker_tail_t
enum_rocprof_trace_decoder_codeobj_marker_type_t = CEnum(ctypes.c_uint32)
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_TAIL = enum_rocprof_trace_decoder_codeobj_marker_type_t.define('ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_TAIL', 0)
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_LO = enum_rocprof_trace_decoder_codeobj_marker_type_t.define('ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_LO', 1)
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_LO = enum_rocprof_trace_decoder_codeobj_marker_type_t.define('ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_LO', 2)
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_HI = enum_rocprof_trace_decoder_codeobj_marker_type_t.define('ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_HI', 3)
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_HI = enum_rocprof_trace_decoder_codeobj_marker_type_t.define('ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_HI', 4)
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_LO = enum_rocprof_trace_decoder_codeobj_marker_type_t.define('ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_LO', 5)
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_HI = enum_rocprof_trace_decoder_codeobj_marker_type_t.define('ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_HI', 6)
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_LAST = enum_rocprof_trace_decoder_codeobj_marker_type_t.define('ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_LAST', 7)
# values for enumeration 'rocprof_trace_decoder_codeobj_marker_type_t'
rocprof_trace_decoder_codeobj_marker_type_t__enumvalues = {
0: 'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_TAIL',
1: 'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_LO',
2: 'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_LO',
3: 'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_HI',
4: 'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_HI',
5: 'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_LO',
6: 'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_HI',
7: 'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_LAST',
}
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_TAIL = 0
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_LO = 1
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_LO = 2
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_HI = 3
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_HI = 4
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_LO = 5
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_HI = 6
ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_LAST = 7
rocprof_trace_decoder_codeobj_marker_type_t = ctypes.c_uint32 # enum
# Names exported by ``from <module> import *``, one entry per line.
__all__ = [
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_DATA_LOST',
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_LAST',
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_NONE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_STITCH_INCOMPLETE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INFO_WAVE_INCOMPLETE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_BVH',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_CONTEXT',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_FLAT',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_IMMED',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_JUMP',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_LAST',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_LDS',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_MESSAGE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_NEXT',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_NONE',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_SALU',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_SMEM',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_VALU',
    'ROCPROFILER_THREAD_TRACE_DECODER_INST_VMEM',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_DEBUG',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_GFXIP',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_INFO',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_LAST',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_OCCUPANCY',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_PERFEVENT',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_REALTIME',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_RT_FREQUENCY',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_SHADERDATA',
    'ROCPROFILER_THREAD_TRACE_DECODER_RECORD_WAVE',
    'ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_IMM',
    'ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_PRIV',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_ARGUMENT',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_INVALID_SHADER_DATA',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_ERROR_OUT_OF_RESOURCES',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_LAST',
    'ROCPROFILER_THREAD_TRACE_DECODER_STATUS_SUCCESS',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EMPTY',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EXEC',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_IDLE',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_LAST',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_STALL',
    'ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_WAIT',
    'ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_COUNTER_INTERVAL',
    'ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_LAST',
    'ROCPROF_TRACE_DECODER_AGENT_INFO_TYPE_RT_FREQUENCY_KHZ',
    'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_HI',
    'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ADDR_LO',
    'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_HI',
    'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_ID_LO',
    'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_LAST',
    'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_HI',
    'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_SIZE_LO',
    'ROCPROF_TRACE_DECODER_CODEOBJ_MARKER_TYPE_TAIL',
    'ROCPROF_TRACE_DECODER_PACKET_OPCODE_AGENT_INFO',
    'ROCPROF_TRACE_DECODER_PACKET_OPCODE_CODEOBJ',
    'ROCPROF_TRACE_DECODER_PACKET_OPCODE_RT_TIMESTAMP',
    'c__EA_rocprofiler_thread_trace_decoder_status_t',
    'rocprof_trace_decoder_agent_info_type_t',
    'rocprof_trace_decoder_codeobj_marker_tail_t',
    'rocprof_trace_decoder_codeobj_marker_type_t',
    'rocprof_trace_decoder_dump_data',
    'rocprof_trace_decoder_get_info_string',
    'rocprof_trace_decoder_get_status_string',
    'rocprof_trace_decoder_gfx9_header_t',
    'rocprof_trace_decoder_instrument_enable_t',
    'rocprof_trace_decoder_isa_callback_t',
    'rocprof_trace_decoder_packet_header_t',
    'rocprof_trace_decoder_packet_opcode_t',
    'rocprof_trace_decoder_parse_data',
    'rocprof_trace_decoder_se_data_callback_t',
    'rocprof_trace_decoder_trace_callback_t',
    'rocprofiler_thread_trace_decoder_debug_callback_t',
    'rocprofiler_thread_trace_decoder_info_t',
    'rocprofiler_thread_trace_decoder_inst_category_t',
    'rocprofiler_thread_trace_decoder_inst_t',
    'rocprofiler_thread_trace_decoder_occupancy_t',
    'rocprofiler_thread_trace_decoder_pc_t',
    'rocprofiler_thread_trace_decoder_perfevent_t',
    'rocprofiler_thread_trace_decoder_realtime_t',
    'rocprofiler_thread_trace_decoder_record_type_t',
    'rocprofiler_thread_trace_decoder_shaderdata_flags_t',
    'rocprofiler_thread_trace_decoder_shaderdata_t',
    'rocprofiler_thread_trace_decoder_status_t',
    'rocprofiler_thread_trace_decoder_status_t__enumvalues',
    'rocprofiler_thread_trace_decoder_wave_state_t',
    'rocprofiler_thread_trace_decoder_wave_t',
    'rocprofiler_thread_trace_decoder_wstate_type_t',
    'struct_rocprof_trace_decoder_codeobj_marker_tail_t_0',
    'struct_rocprof_trace_decoder_gfx9_header_t_0',
    'struct_rocprof_trace_decoder_instrument_enable_t_0',
    'struct_rocprof_trace_decoder_packet_header_t_0',
    'struct_rocprofiler_thread_trace_decoder_inst_t',
    'struct_rocprofiler_thread_trace_decoder_occupancy_t',
    'struct_rocprofiler_thread_trace_decoder_pc_t',
    'struct_rocprofiler_thread_trace_decoder_perfevent_t',
    'struct_rocprofiler_thread_trace_decoder_realtime_t',
    'struct_rocprofiler_thread_trace_decoder_shaderdata_t',
    'struct_rocprofiler_thread_trace_decoder_wave_state_t',
    'struct_rocprofiler_thread_trace_decoder_wave_t',
    'uint64_t',
    'union_rocprof_trace_decoder_codeobj_marker_tail_t',
    'union_rocprof_trace_decoder_gfx9_header_t',
    'union_rocprof_trace_decoder_instrument_enable_t',
    'union_rocprof_trace_decoder_packet_header_t',
]
# Typedef-style aliases: each name without the enum_/struct_ prefix refers to
# the same object as the prefixed definition.
rocprof_trace_decoder_codeobj_marker_type_t = enum_rocprof_trace_decoder_codeobj_marker_type_t
rocprofiler_thread_trace_decoder_pc_t = struct_rocprofiler_thread_trace_decoder_pc_t
# Binding for the perfevent record. The class is declared empty first and its `_fields_` layout is assigned afterwards.
# NOTE(review): this looks like generated binding code — presumably it should be regenerated rather than hand-edited; confirm.
class struct_rocprofiler_thread_trace_decoder_perfevent_t(Struct): pass
# Fixed-width integer aliases onto ctypes scalars; they are reused by the struct definitions that follow.
int64_t = ctypes.c_int64
uint16_t = ctypes.c_uint16
uint8_t = ctypes.c_ubyte
# Layout: one int64 `time`, four uint16 counters `events0`..`events3`, then two uint8 fields `CU` and `bank`.
struct_rocprofiler_thread_trace_decoder_perfevent_t._fields_ = [
('time', int64_t),
('events0', uint16_t),
('events1', uint16_t),
('events2', uint16_t),
('events3', uint16_t),
('CU', uint8_t),
('bank', uint8_t),
]
# Typedef-style alias for the struct above.
rocprofiler_thread_trace_decoder_perfevent_t = struct_rocprofiler_thread_trace_decoder_perfevent_t
# Binding for the occupancy record: an embedded `pc` struct, a uint64 `time`, four uint8 fields, and one 32-bit word
# split into two bit-fields.
class struct_rocprofiler_thread_trace_decoder_occupancy_t(Struct): pass
struct_rocprofiler_thread_trace_decoder_occupancy_t._fields_ = [
('pc', rocprofiler_thread_trace_decoder_pc_t),
('time', uint64_t),
('reserved', uint8_t),
('cu', uint8_t),
('simd', uint8_t),
('wave_id', uint8_t),
# 3-tuples are ctypes-style bit-fields (name, base type, bit width): 1 bit + 31 bits of uint32_t.
('start', uint32_t,1),
('_rsvd', uint32_t,31),
]
# Typedef-style alias for the struct above.
rocprofiler_thread_trace_decoder_occupancy_t = struct_rocprofiler_thread_trace_decoder_occupancy_t
# Wave-state enum built on a 32-bit unsigned base type. The result of each `.define(name, value)` call is bound to a
# module-level constant of the same name; values run 0..5, with `_LAST` as the final entry.
# NOTE(review): `CEnum.define` is declared elsewhere — presumably it returns the integer value; confirm.
enum_rocprofiler_thread_trace_decoder_wstate_type_t = CEnum(ctypes.c_uint32)
ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EMPTY = enum_rocprofiler_thread_trace_decoder_wstate_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EMPTY', 0)
ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_IDLE = enum_rocprofiler_thread_trace_decoder_wstate_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_IDLE', 1)
ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EXEC = enum_rocprofiler_thread_trace_decoder_wstate_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_EXEC', 2)
ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_WAIT = enum_rocprofiler_thread_trace_decoder_wstate_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_WAIT', 3)
ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_STALL = enum_rocprofiler_thread_trace_decoder_wstate_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_STALL', 4)
ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_LAST = enum_rocprofiler_thread_trace_decoder_wstate_type_t.define('ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_LAST', 5)
# Typedef-style alias for the enum above.
rocprofiler_thread_trace_decoder_wstate_type_t = enum_rocprofiler_thread_trace_decoder_wstate_type_t
# Binding for one wave-state entry: two signed 32-bit fields, `type` and `duration`.
class struct_rocprofiler_thread_trace_decoder_wave_state_t(Struct): pass
# Signed 32-bit alias, reused by later struct definitions.
int32_t = ctypes.c_int32
struct_rocprofiler_thread_trace_decoder_wave_state_t._fields_ = [
# NOTE(review): `type` is declared as a plain int32, not as the wstate enum type — presumably it carries a
# ROCPROFILER_THREAD_TRACE_DECODER_WSTATE_* value; confirm.
('type', int32_t),
('duration', int32_t),
]
# Typedef-style alias for the struct above.
rocprofiler_thread_trace_decoder_wave_state_t = struct_rocprofiler_thread_trace_decoder_wave_state_t
# Instruction-category enum built on a 32-bit unsigned base type. The result of each `.define(name, value)` call is
# bound to a module-level constant of the same name; values run 0..13, with `_LAST` as the final entry.
enum_rocprofiler_thread_trace_decoder_inst_category_t = CEnum(ctypes.c_uint32)
ROCPROFILER_THREAD_TRACE_DECODER_INST_NONE = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_NONE', 0)
ROCPROFILER_THREAD_TRACE_DECODER_INST_SMEM = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_SMEM', 1)
ROCPROFILER_THREAD_TRACE_DECODER_INST_SALU = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_SALU', 2)
ROCPROFILER_THREAD_TRACE_DECODER_INST_VMEM = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_VMEM', 3)
ROCPROFILER_THREAD_TRACE_DECODER_INST_FLAT = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_FLAT', 4)
ROCPROFILER_THREAD_TRACE_DECODER_INST_LDS = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_LDS', 5)
ROCPROFILER_THREAD_TRACE_DECODER_INST_VALU = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_VALU', 6)
ROCPROFILER_THREAD_TRACE_DECODER_INST_JUMP = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_JUMP', 7)
ROCPROFILER_THREAD_TRACE_DECODER_INST_NEXT = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_NEXT', 8)
ROCPROFILER_THREAD_TRACE_DECODER_INST_IMMED = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_IMMED', 9)
ROCPROFILER_THREAD_TRACE_DECODER_INST_CONTEXT = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_CONTEXT', 10)
ROCPROFILER_THREAD_TRACE_DECODER_INST_MESSAGE = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_MESSAGE', 11)
ROCPROFILER_THREAD_TRACE_DECODER_INST_BVH = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_BVH', 12)
ROCPROFILER_THREAD_TRACE_DECODER_INST_LAST = enum_rocprofiler_thread_trace_decoder_inst_category_t.define('ROCPROFILER_THREAD_TRACE_DECODER_INST_LAST', 13)
# Typedef-style alias for the enum above.
rocprofiler_thread_trace_decoder_inst_category_t = enum_rocprofiler_thread_trace_decoder_inst_category_t
# Binding for one decoded instruction record: a 32-bit word split into two bit-fields, an int32 `duration`,
# an int64 `time`, and an embedded `pc` struct.
class struct_rocprofiler_thread_trace_decoder_inst_t(Struct): pass
struct_rocprofiler_thread_trace_decoder_inst_t._fields_ = [
# Bit-fields (name, base type, bit width): 8 bits + 24 bits of uint32_t.
# NOTE(review): `category` presumably holds a ROCPROFILER_THREAD_TRACE_DECODER_INST_* value — confirm.
('category', uint32_t,8),
('stall', uint32_t,24),
('duration', int32_t),
('time', int64_t),
('pc', rocprofiler_thread_trace_decoder_pc_t),
]
# Typedef-style alias for the struct above.
rocprofiler_thread_trace_decoder_inst_t = struct_rocprofiler_thread_trace_decoder_inst_t
# Binding for a per-wave record: four uint8 fields, three reserved uint32 words, int64 begin/end times,
# two uint64 size fields, and two pointers to arrays of wave-state and instruction records.
class struct_rocprofiler_thread_trace_decoder_wave_t(Struct): pass
struct_rocprofiler_thread_trace_decoder_wave_t._fields_ = [
('cu', uint8_t),
('simd', uint8_t),
('wave_id', uint8_t),
('contexts', uint8_t),
('_rsvd1', uint32_t),
('_rsvd2', uint32_t),
('_rsvd3', uint32_t),
('begin_time', int64_t),
('end_time', int64_t),
# NOTE(review): presumably the element counts of `timeline_array` / `instructions_array` below — confirm.
('timeline_size', uint64_t),
('instructions_size', uint64_t),
('timeline_array', ctypes.POINTER(rocprofiler_thread_trace_decoder_wave_state_t)),
('instructions_array', ctypes.POINTER(rocprofiler_thread_trace_decoder_inst_t)),
]
# Typedef-style alias for the struct above.
rocprofiler_thread_trace_decoder_wave_t = struct_rocprofiler_thread_trace_decoder_wave_t
# Binding for a realtime record: an int64 `shader_clock`, a uint64 `realtime_clock`, and a uint64 `reserved` field.
class struct_rocprofiler_thread_trace_decoder_realtime_t(Struct): pass
struct_rocprofiler_thread_trace_decoder_realtime_t._fields_ = [
('shader_clock', int64_t),
('realtime_clock', uint64_t),
('reserved', uint64_t),
]
# Typedef-style alias for the struct above.
rocprofiler_thread_trace_decoder_realtime_t = struct_rocprofiler_thread_trace_decoder_realtime_t
# Shaderdata-flags enum built on a 32-bit unsigned base type, with two values: IMM = 0 and PRIV = 1.
enum_rocprofiler_thread_trace_decoder_shaderdata_flags_t = CEnum(ctypes.c_uint32)
ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_IMM = enum_rocprofiler_thread_trace_decoder_shaderdata_flags_t.define('ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_IMM', 0)
ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_PRIV = enum_rocprofiler_thread_trace_decoder_shaderdata_flags_t.define('ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_PRIV', 1)
# Typedef-style alias for the enum above.
rocprofiler_thread_trace_decoder_shaderdata_flags_t = enum_rocprofiler_thread_trace_decoder_shaderdata_flags_t
# Binding for a shaderdata record: an int64 `time`, a uint64 `value`, four uint8 fields, and a uint32 `reserved` word.
class struct_rocprofiler_thread_trace_decoder_shaderdata_t(Struct): pass
struct_rocprofiler_thread_trace_decoder_shaderdata_t._fields_ = [
('time', int64_t),
('value', uint64_t),
('cu', uint8_t),
('simd', uint8_t),
('wave_id', uint8_t),
# NOTE(review): declared as a plain uint8 — presumably carries ROCPROFILER_THREAD_TRACE_DECODER_SHADERDATA_FLAGS_* values; confirm.
('flags', uint8_t),
('reserved', uint32_t),
]
# Typedef-style alias for the struct above.
rocprofiler_thread_trace_decoder_shaderdata_t = struct_rocprofiler_thread_trace_decoder_shaderdata_t
# Typedef-style alias; `enum_rocprofiler_thread_trace_decoder_record_type_t` must already be bound above this point.
rocprofiler_thread_trace_decoder_record_type_t = enum_rocprofiler_thread_trace_decoder_record_type_t

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -188,14 +188,9 @@ class AMDComputeQueue(HWQueue):
### SQTT ###
def sqtt_setup_exec(self, prg, global_size):
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_pipeline_bind(
_0=sqtt.union_rgp_sqtt_marker_pipeline_bind_0(_0=sqtt.struct_rgp_sqtt_marker_pipeline_bind_0_0(
identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE, bind_point=(__BIND_POINT_COMPUTE:=1))),
_1=sqtt.union_rgp_sqtt_marker_pipeline_bind_1(api_pso_hash=data64_le(prg.libhash[0]))))
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_event(
_0=sqtt.union_rgp_sqtt_marker_event_0(_0=sqtt.struct_rgp_sqtt_marker_event_0_0(has_thread_dims=1)),
_2=sqtt.union_rgp_sqtt_marker_event_2(cmd_id=next(prg.dev.sqtt_next_cmd_id))), *global_size)
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_pipeline_bind(identifier=sqtt.RGP_SQTT_MARKER_IDENTIFIER_BIND_PIPELINE,
bind_point=(__BIND_POINT_COMPUTE:=1), api_pso_hash=data64_le(prg.libhash[0])))
self.sqtt_userdata(sqtt.struct_rgp_sqtt_marker_event(has_thread_dims=1, cmd_id=next(prg.dev.sqtt_next_cmd_id)), *global_size)
se_cap = max(prod([x if isinstance(x, int) else 1 for x in global_size]) // 4, 1) // 32
for xcc in range(self.dev.xccs):

View file

@ -11,7 +11,7 @@ if MOCKGPU:=getenv("MOCKGPU"): from test.mockgpu.cuda import cuda # type: ignore
def check(status):
if status != 0:
error = ctypes.string_at(init_c_var(ctypes.POINTER(ctypes.c_char)(), lambda x: cuda.cuGetErrorString(status, ctypes.byref(x)))).decode()
error = ctypes.string_at(init_c_var(ctypes.c_char_p(), lambda x: cuda.cuGetErrorString(status, x))).decode()
raise RuntimeError(f"CUDA Error {status}, {error}")
def encode_args(args, vals) -> tuple[ctypes.Structure, ctypes.Array]:
@ -42,7 +42,7 @@ class CUDAProgram:
status = cuda.cuModuleLoadData(ctypes.byref(self.module), lib)
if status != 0:
del self.module
raise RuntimeError(f"module load failed with status code {status}: {cuda.cudaError_enum__enumvalues[status]}")
raise RuntimeError(f"module load failed with status code {status}: {cuda.CUresult.get(status)}")
check(cuda.cuModuleGetFunction(ctypes.byref(prg := cuda.CUfunction()), self.module, name.encode("utf-8")))
self.prg = prg
if self.smem > 0: check(cuda.cuFuncSetAttribute(self.prg, cuda.CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, self.smem))

View file

@ -36,7 +36,7 @@ def uvm_ioctl(cmd, sttyp, fd:FileIOInterface, **kwargs):
def make_uvm_type():
return type("NVUVM", (object,), {name.replace("UVM_", "").lower(): functools.partial(uvm_ioctl, dt, getattr(nv_gpu, name+"_PARAMS"))
for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")})
for name,dt in nv_gpu.__dict__.items() if name.startswith("UVM_") and nv_gpu.__dict__.get(name+"_PARAMS")})
uvm = make_uvm_type()
class QMD:
@ -436,7 +436,7 @@ class NVKIface:
def _gpu_uvm_map(self, va_base, size, mem_handle, create_range=True, has_cpu_mapping=False) -> HCQBuffer:
if create_range: uvm.create_external_range(self.fd_uvm, base=va_base, length=size)
attrs = (nv_gpu.struct_c__SA_UvmGpuMappingAttributes*256)(nv_gpu.struct_c__SA_UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
attrs = (nv_gpu.UvmGpuMappingAttributes*256)(nv_gpu.UvmGpuMappingAttributes(gpuUuid=self.gpu_uuid, gpuMappingType=1))
# NOTE: va_addr is set to make rawbufs compatible with HCQBuffer protocol.
return HCQBuffer(va_base, size, meta=uvm.map_external_allocation(self.fd_uvm, base=va_base, length=size, rmCtrlFd=self.fd_ctl.fd,

View file

@ -321,13 +321,15 @@ class QCOMAllocator(HCQAllocatorBase):
self.dev.synchronize()
self.dev._gpu_free(opaque)
def flag(nm, val):
  """Shift `val` into the bit-field called `nm` and clip it to that field's mask.

  The shift amount and mask are read from the `kgsl` module attributes `<nm>_SHIFT` and `<nm>_MASK`.
  """
  shifted = val << getattr(kgsl, f"{nm}_SHIFT")
  return shifted & getattr(kgsl, f"{nm}_MASK")
class QCOMDevice(HCQCompiled):
def __init__(self, device:str=""):
self.fd = FileIOInterface('/dev/kgsl-3d0', os.O_RDWR)
self.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr)
flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
| kgsl.KGSL_CONTEXT_PRIORITY(getenv("QCOM_PRIORITY", 8)) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)
| flag("KGSL_CONTEXT_PRIORITY", getenv("QCOM_PRIORITY", 8)) | flag("KGSL_CONTEXT_PREEMPT_STYLE", kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)
self.ctx = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=flags).drawctxt_id
self.cmd_buf = self._gpu_alloc(16 << 20)
@ -357,8 +359,8 @@ class QCOMDevice(HCQCompiled):
functools.partial(QCOMComputeQueue, self), None)
def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False) -> HCQBuffer:
flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)
flags |= flag("KGSL_MEMALIGN", alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
if uncached: flags |= flag("KGSL_CACHEMODE", kgsl.KGSL_CACHEMODE_UNCACHED)
alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=(bosz:=round_up(size, 1<<alignment_hint)), flags=flags, mmapsize=bosz)
va_addr = self.fd.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, alloc.id * 0x1000)

View file

@ -10,7 +10,7 @@ import os
WGPUDevPtr: TypeAlias = webgpu.WGPUDevice # type: ignore
WGPUBufPtr: TypeAlias = webgpu.WGPUBuffer # type: ignore
backend_types = {v: k for k, v in webgpu.WGPUBackendType__enumvalues.items() }
backend_types = {v: k for k, v in webgpu.enum_WGPUBackendType.items() }
instance = webgpu.wgpuCreateInstance(webgpu.WGPUInstanceDescriptor(features = webgpu.WGPUInstanceFeatures(timedWaitAnyEnable = True)))
@ -38,7 +38,7 @@ def _run(async_fun, cb_info_type, cb_type, status_enum, res_idx:int|None, msg_id
cb_info = cb_info_type(nextInChain=None, mode=webgpu.WGPUCallbackMode_WaitAnyOnly, callback=cb_type(cb))
_wait(async_fun(*params, cb_info))
if result[0] != 1: raise RuntimeError(f"[{status_enum[result[0]] if status_enum else 'ERROR'}]{result[msg_idx] if msg_idx else ''}")
if result[0] != 1: raise RuntimeError(f"[{status_enum.get(result[0]) if status_enum else 'ERROR'}]{result[msg_idx] if msg_idx else ''}")
return result[res_idx] if res_idx else None
def copy_buffer_to_buffer(dev:WGPUDevPtr, src:WGPUBufPtr, src_offset:int, dst:WGPUBufPtr, dst_offset:int, size:int):
@ -54,8 +54,8 @@ def read_buffer(dev:WGPUDevPtr, buf:WGPUBufPtr) -> memoryview:
tmp_buffer = webgpu.wgpuDeviceCreateBuffer(dev, webgpu.WGPUBufferDescriptor(size=size,
usage=webgpu.WGPUBufferUsage_CopyDst | webgpu.WGPUBufferUsage_MapRead, mappedAtCreation=False))
copy_buffer_to_buffer(dev, buf, 0, tmp_buffer, 0, size)
_run(webgpu.wgpuBufferMapAsync2, webgpu.WGPUBufferMapCallbackInfo2, webgpu.WGPUBufferMapCallback2, webgpu.WGPUBufferMapAsyncStatus__enumvalues,
None, 0, tmp_buffer, webgpu.WGPUMapMode_Read, 0, size)
_run(webgpu.wgpuBufferMapAsync2, webgpu.WGPUBufferMapCallbackInfo2, webgpu.WGPUBufferMapCallback2, webgpu.WGPUBufferMapAsyncStatus, None, 0,
tmp_buffer, webgpu.WGPUMapMode_Read, 0, size)
void_ptr = ctypes.cast(webgpu.wgpuBufferGetConstMappedRange(tmp_buffer, 0, size), ctypes.c_void_p)
buf_copy = bytearray((ctypes.c_uint8 * size).from_address(void_ptr.value))
webgpu.wgpuBufferUnmap(tmp_buffer)
@ -140,7 +140,7 @@ class WebGPUProgram:
compute_desc = webgpu.WGPUComputePipelineDescriptor(layout=pipeline_layout,
compute=webgpu.WGPUComputeState(module=self.prg, entryPoint=to_wgpu_str(self.name)))
pipeline_result = _run(webgpu.wgpuDeviceCreateComputePipelineAsync2, webgpu.WGPUCreateComputePipelineAsyncCallbackInfo2,
webgpu.WGPUCreateComputePipelineAsyncCallback2, webgpu.WGPUCreatePipelineAsyncStatus__enumvalues, 1, None, self.dev, compute_desc)
webgpu.WGPUCreateComputePipelineAsyncCallback2, webgpu.WGPUCreatePipelineAsyncStatus, 1, None, self.dev, compute_desc)
command_encoder = webgpu.wgpuDeviceCreateCommandEncoder(self.dev, webgpu.WGPUCommandEncoderDescriptor())
comp_pass_desc = webgpu.WGPUComputePassDescriptor(nextInChain=None)
@ -195,9 +195,7 @@ class WebGpuDevice(Compiled):
def __init__(self, device:str):
# Requesting an adapter
adapter_res = _run(webgpu.wgpuInstanceRequestAdapterF, webgpu.WGPURequestAdapterCallbackInfo, webgpu.WGPURequestAdapterCallback,
webgpu.WGPURequestAdapterStatus__enumvalues, 1, 2, instance,
webgpu.WGPURequestAdapterOptions(powerPreference=webgpu.WGPUPowerPreference_HighPerformance,
webgpu.WGPURequestAdapterStatus, 1, 2, instance, webgpu.WGPURequestAdapterOptions(powerPreference=webgpu.WGPUPowerPreference_HighPerformance,
backendType=backend_types.get(os.getenv("WEBGPU_BACKEND", ""), 0)))
# Get supported features
@ -215,11 +213,11 @@ class WebGpuDevice(Compiled):
# Requesting a device
device_res = _run(webgpu.wgpuAdapterRequestDeviceF, webgpu.WGPURequestDeviceCallbackInfo, webgpu.WGPURequestDeviceCallback,
webgpu.WGPURequestDeviceStatus__enumvalues, 1, 2, adapter_res, dev_desc)
webgpu.WGPURequestDeviceStatus, 1, 2, adapter_res, dev_desc)
super().__init__(device, WebGpuAllocator(device_res), [(WGSLRenderer, Compiler)],
functools.partial(WebGPUProgram, (device_res, webgpu.WGPUFeatureName_TimestampQuery in supported)))
def synchronize(self):
_run(webgpu.wgpuQueueOnSubmittedWorkDone2, webgpu.WGPUQueueWorkDoneCallbackInfo2, webgpu.WGPUQueueWorkDoneCallback2,
webgpu.WGPUQueueWorkDoneStatus__enumvalues, None, None, webgpu.wgpuDeviceGetQueue(self.runtime.args[0][0]))
webgpu.WGPUQueueWorkDoneStatus, None, None, webgpu.wgpuDeviceGetQueue(self.runtime.args[0][0]))

View file

@ -238,8 +238,6 @@ class AMDev(PCIDevImplBase):
ihdr = am.struct_ip_discovery_header.from_address(ctypes.addressof(self.bhdr) + self.bhdr.table_list[am.IP_DISCOVERY].offset)
assert self.bhdr.binary_signature == am.BINARY_SIGNATURE and ihdr.signature == am.DISCOVERY_TABLE_SIGNATURE, "discovery signatures mismatch"
# Mapping of HW IP to Discovery HW IP
hw_id_map = {am.__dict__[x]: int(y) for x,y in am.hw_id_map}
self.regs_offset:dict[int, dict[int, tuple]] = collections.defaultdict(dict)
self.ip_ver:dict[int, tuple[int, int, int]] = {}
@ -251,7 +249,7 @@ class AMDev(PCIDevImplBase):
ip = am.struct_ip_v4.from_address(ip_offset)
ba = ((ctypes.c_uint64 if ihdr.base_addr_64_bit else ctypes.c_uint32) * ip.num_base_address).from_address(ip_offset + 8)
for hw_ip in range(1, am.MAX_HWIP):
if hw_ip in hw_id_map and hw_id_map[hw_ip] == ip.hw_id:
if hw_ip in am.hw_id_map and am.hw_id_map[hw_ip] == ip.hw_id:
self.regs_offset[hw_ip][ip.instance_number] = tuple(list(ba))
self.ip_ver[hw_ip] = (ip.major, ip.minor, ip.revision)

View file

@ -427,7 +427,7 @@ class AM_PSP(AM_IP):
self._wait_for_bootloader()
if DEBUG >= 2: print(f"am {self.adev.devfmt}: loading sos component: {am.psp_fw_type__enumvalues[fw]}")
if DEBUG >= 2: print(f"am {self.adev.devfmt}: loading sos component: {am.enum_psp_fw_type.get(fw)}")
self._prep_msg1(self.adev.fw.sos_fw[fw])
self.adev.reg(f"{self.reg_pref}_36").write(self.msg1_addr >> 20)
@ -482,7 +482,7 @@ class AM_PSP(AM_IP):
def _load_ip_fw_cmd(self, fw_types:list[int], fw_bytes:memoryview):
self._prep_msg1(fw_bytes)
for fw_type in fw_types:
if DEBUG >= 2: print(f"am {self.adev.devfmt}: loading fw: {am.psp_gfx_fw_type__enumvalues[fw_type]}")
if DEBUG >= 2: print(f"am {self.adev.devfmt}: loading fw: {am.enum_psp_gfx_fw_type.get(fw_type)}")
cmd = am.struct_psp_gfx_cmd_resp(cmd_id=am.GFX_CMD_ID_LOAD_IP_FW)
cmd.cmd.cmd_load_ip_fw.fw_phy_addr_hi, cmd.cmd.cmd_load_ip_fw.fw_phy_addr_lo = data64(self.msg1_addr)
cmd.cmd.cmd_load_ip_fw.fw_size = len(fw_bytes)

View file

@ -47,7 +47,6 @@ def gen(dll, files, args=[], prolog=[], rules=[], epilog=[], recsym=False, use_e
return types[t.spelling][0]
case TK.RECORD:
# TODO: packed unions
# TODO: pragma pack support
# check for forward declaration
if t.spelling in types: types[t.spelling] = (nm:=types[t.spelling][0]), len(list(t.get_fields())) != 0
else:
@ -56,11 +55,15 @@ def gen(dll, files, args=[], prolog=[], rules=[], epilog=[], recsym=False, use_e
else: types[t.spelling] = (nm:=t.spelling.replace(' ', '_').replace('::', '_')), len(list(t.get_fields())) != 0
lines.append(f"class {nm}({'Struct' if decl.kind==CK.STRUCT_DECL else 'ctypes.Union'}): pass")
if typedef: lines.append(f"{typedef} = {nm}")
if (is_packed:=(CK.PACKED_ATTR in attrs(decl)) or ((N:=t.get_align()) != max([f.type.get_align() for f in t.get_fields()], default=N))):
if t.get_align() != 1:
print(f"WARNING: ignoring alignment={t.get_align()} on {t.spelling}")
is_packed = False
acnt = itertools.count().__next__
ll=[" ("+((fn:=f"'_{acnt()}'")+f", {tname(f.type, nm+fn[1:-1])}" if f.is_anonymous_record_decl() else f"'{f.spelling}', "+
tname(f.type, f'{nm}_{f.spelling}'))+(f',{f.get_bitfield_width()}' if f.is_bitfield() else '')+")," for f in t.get_fields()]
lines.extend(([f"{nm}._anonymous_ = ["+", ".join(f"'_{i}'" for i in range(n))+"]"] if (n:=acnt()) else [])+
([f"{nm}._packed_ = True"] * (CK.PACKED_ATTR in attrs(decl)))+([f"{nm}._fields_ = [",*ll,"]"] if ll else []))
([f"{nm}._packed_ = True"] * is_packed)+([f"{nm}._fields_ = [",*ll,"]"] if ll else []))
return nm
case TK.ENUM:
# TODO: C++ and GNU C have forward declared enums

View file

@ -1,10 +1,13 @@
import ctypes, functools, sys
from typing import TYPE_CHECKING
from tinygrad.helpers import flatten
from _ctypes import _SimpleCData
def _do_ioctl(__idir, __base, __nr, __struct, __fd, **kwargs):
def _do_ioctl(__idir, __base, __nr, __struct, __fd, *args, __payload=None, **kwargs):
import tinygrad.runtime.support.hcq as hcq, fcntl
ioctl = __fd.ioctl if isinstance(__fd, hcq.FileIOInterface) else functools.partial(fcntl.ioctl, __fd)
if (rc:=ioctl((__idir<<30)|(ctypes.sizeof(out:=__struct(**kwargs))<<16)|(__base<<8)|__nr, out)): raise RuntimeError(f"ioctl returned {rc}")
if (rc:=ioctl((__idir<<30)|(ctypes.sizeof(out:=(__payload or __struct(*args, **kwargs)))<<16)|(__base<<8)|__nr, out)):
raise RuntimeError(f"ioctl returned {rc}")
return out
def _IO(base, nr): return functools.partial(_do_ioctl, 0, ord(base) if isinstance(base, str) else base, nr, None)
@ -50,16 +53,28 @@ else:
@staticmethod
def _build(cls, fields):
  """Lay out `fields` bit-by-bit inside a single byte array and expose every field as a property on `cls`.

  Args:
    cls: the structure class being built. Its real `_fields_` becomes one `_data` array of `c_ubyte`.
    fields: ctypes-style field tuples, `(name, type)` or `(name, type, bit_width)`.

  Offsets are tracked in bits. Bit-fields are placed back to back; every non-bit-field member starts on the
  next byte boundary. Simple C scalars are read and written by masking the integer value of `_data`; any other
  member type is returned as a `from_buffer` view over its bytes of `_data`.

  For every name in `cls._anonymous_`, the fields of that member (and of the member's own anonymous members, one
  level deep) are forwarded as properties on `cls`. The list is then moved to `_packed_anonymous_` and
  `_anonymous_` is cleared before `_fields_` is assigned, since `_fields_` will only contain `_data`.
  """
  def fget(self, mask, off, ty):
    if issubclass(ty, _SimpleCData): return (int.from_bytes(self._data, sys.byteorder) >> off) & mask
    return ty.from_buffer(memoryview(self._data)[(st:=off//8):st+ctypes.sizeof(ty)])
  def fset(self, val, mask, off):
    self._data[:] = (((int.from_bytes(self._data, sys.byteorder) & ~(mask<<off)) | ((val&mask)<<off))
                     .to_bytes(len(self._data), sys.byteorder))

  offset = 0
  for nm, ty, bf in [(f[0], f[1], f[2] if len(f) == 3 else 0) for f in fields]:
    # only bit-fields may start mid-byte
    if bf == 0: offset = (offset + 7) & ~7
    mask = (1 << (sz:=ctypes.sizeof(ty)*8 if bf == 0 else bf)) - 1
    setattr(cls, nm, property(functools.partial(fget, mask=mask, off=offset, ty=ty), functools.partial(fset, mask=mask, off=offset)))
    offset += sz

  # prefer the packed (pre-build) metadata of a member type when it was itself built by this function
  def pget(ty, s): return getattr(ty, f'_packed_{s}_', getattr(ty, f'_{s}_', []))
  def get_aty(anm, fs=fields): return next(f[1] for f in fs if f[0] == anm)
  def get_fnms(ty): return [f[0] for f in pget(ty, 'fields') if f[0] not in pget(ty, 'anonymous')]

  if hasattr(cls, '_anonymous_'):
    # anm/fnm are bound per property with partial: a bare lambda here would late-bind and make every
    # forwarded property resolve to the last field of the last anonymous member
    def aget(self, anm, fnm): return getattr(getattr(self, anm), fnm)
    def aset(self, val, anm, fnm): setattr(getattr(self, anm), fnm, val)
    for anm, aty in [(a, get_aty(a)) for a in cls._anonymous_]:
      for fnm in (get_fnms(aty) + flatten([get_fnms(get_aty(aanm, pget(aty, 'fields'))) for aanm in pget(aty, 'anonymous')])):
        setattr(cls, fnm, property(functools.partial(aget, anm=anm, fnm=fnm), functools.partial(aset, anm=anm, fnm=fnm)))
    setattr(cls, '_packed_anonymous_', cls._anonymous_)
    setattr(cls, '_anonymous_', [])

  # bypass any metaclass __setattr__ override so the real ctypes layout is just the backing byte array
  type(ctypes.Structure).__setattr__(cls, '_fields_', [('_data', ctypes.c_ubyte * ((offset + 7) // 8))])
  type(ctypes.Structure).__setattr__(cls, '_packed_', True)
  setattr(cls, '_packed_fields_', fields)
@ -69,4 +84,3 @@ else:
for f,v in zip(self._packed_fields_, args): setattr(self, f[0], v)
for k,v in kwargs.items(): setattr(self, k, v)
else: super().__init__(*args, **kwargs)

View file

@ -1,14 +1,10 @@
import ctypes
from tinygrad.helpers import system
import tinygrad.runtime.autogen.comgr as comgr
assert comgr.AMD_COMGR_LANGUAGE_HIP == 4
from tinygrad.runtime.autogen import comgr
try:
comgr.amd_comgr_get_version(ctypes.byref(major:=ctypes.c_uint64()), ctypes.byref(minor:=ctypes.c_uint64()))
if major.value >= 3:
# in comgr 3 the values of enums in headers were changed: https://github.com/ROCm/llvm-project/issues/272
import tinygrad.runtime.autogen.comgr_3 as comgr # type: ignore[no-redef]
assert comgr.AMD_COMGR_LANGUAGE_HIP == 3
except AttributeError: pass # ignore if ROCm isn't installed
assert comgr.AMD_COMGR_LANGUAGE_HIP == 3 if major.value >= 3 else 4
except AttributeError: assert comgr.AMD_COMGR_LANGUAGE_HIP == 3 # if rocm is not installed, use old values
from tinygrad.device import Compiler, CompileError
from tinygrad.runtime.support.compiler_cpu import LLVMCompiler
from tinygrad.helpers import OSX, to_char_p_p

View file

@ -1,8 +1,8 @@
import ctypes, platform, sys, subprocess
from tinygrad.device import Compiler
from tinygrad.helpers import OSX, getenv, capstone_flatdump, DEBUG
from tinygrad.helpers import OSX, getenv, capstone_flatdump, DEBUG, unwrap
from tinygrad.runtime.support.elf import jit_loader
try: import tinygrad.runtime.autogen.llvm as llvm
try: from tinygrad.runtime.autogen import llvm
except (ImportError, FileNotFoundError): llvm = None #type:ignore[assignment]
class ClangJITCompiler(Compiler):
@ -24,7 +24,7 @@ class ClangJITCompiler(Compiler):
def cerr():
  """Return a fresh pointer-to-pointer-to-char, usable as an out-parameter for a C error string."""
  inner = ctypes.pointer(ctypes.c_char())
  return ctypes.pointer(inner)
def expect(x, err, ret=None):
if x: raise RuntimeError(llvm.string_cast(err.contents) if not isinstance(err, str) else err)
if x: raise RuntimeError(unwrap(ctypes.cast(err.contents, ctypes.c_char_p).value).decode() if not isinstance(err, str) else err)
return ret
class LLVMCompiler(Compiler):
@ -50,7 +50,7 @@ class LLVMCompiler(Compiler):
self.passes = b'default<O0>'
self.diag_msgs: list[str] = []
@ctypes.CFUNCTYPE(None, llvm.LLVMDiagnosticInfoRef, ctypes.c_void_p)
@llvm.LLVMDiagnosticHandler
def handle_diag(diag_ref, _arg):
severity = llvm.LLVMGetDiagInfoSeverity(diag_ref)
msg = ctypes.string_at(llvm.LLVMGetDiagInfoDescription(diag_ref)).decode()
@ -70,7 +70,7 @@ class LLVMCompiler(Compiler):
expect(llvm.LLVMRunPasses(mod, self.passes, self.target_machine, self.pbo), 'failed to run passes')
if DEBUG >= 7: print(ctypes.string_at(llvm.LLVMPrintModuleToString(mod)).decode())
obj_buf = expect(llvm.LLVMTargetMachineEmitToMemoryBuffer(self.target_machine, mod, llvm.LLVMObjectFile, err:=cerr(),
ctypes.pointer(buf:=llvm.LLVMMemoryBufferRef())), err, buf)
buf:=llvm.LLVMMemoryBufferRef()), err, buf)
llvm.LLVMDisposeModule(mod)
obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
llvm.LLVMDisposeMemoryBuffer(obj_buf)

View file

@ -1,7 +1,7 @@
import subprocess, hashlib, tempfile, ctypes, re, pathlib
from typing import Callable
from tinygrad.helpers import to_char_p_p, colored, init_c_var, getenv, system
import tinygrad.runtime.autogen.nvrtc as nvrtc
from tinygrad.runtime.autogen import nvrtc, nvjitlink as jitlink
from tinygrad.device import Compiler, CompileError
CUDA_PATH = getenv("CUDA_PATH", "")
@ -17,8 +17,8 @@ def nvrtc_check(status, ctx=None):
def jitlink_check(status, ctx=None):
if status != 0:
err_log = _get_bytes(ctx, nvrtc.nvJitLinkGetErrorLog, nvrtc.nvJitLinkGetErrorLogSize, lambda _: None).decode() if ctx else ""
raise CompileError(f"NvJitLink Error {status}, {nvrtc.nvJitLinkResult__enumvalues.get(status, 'Unknown')}\n{err_log}")
err_log = _get_bytes(ctx, jitlink.nvJitLinkGetErrorLog, jitlink.nvJitLinkGetErrorLogSize, lambda _: None).decode() if ctx else ""
raise CompileError(f"jitlink Error {status}, {jitlink.nvJitLinkResult.get(status)}\n{err_log}")
def pretty_ptx(s):
# all expressions match `<valid_before><expr><valid_after>` and replace it with `<valid_before>color(<expr>)<valid_after>`
@ -83,12 +83,12 @@ class PTXCompiler(Compiler):
class NVPTXCompiler(PTXCompiler):
def __init__(self, arch:str):
nvrtc_check(nvrtc.nvJitLinkVersion(ctypes.byref(ctypes.c_uint()), ctypes.byref(ctypes.c_uint())))
nvrtc_check(jitlink.nvJitLinkVersion(ctypes.byref(ctypes.c_uint()), ctypes.byref(ctypes.c_uint())))
super().__init__(arch, cache_key="nv_ptx")
def compile(self, src:str) -> bytes:
jitlink_check(nvrtc.nvJitLinkCreate(handle := nvrtc.nvJitLinkHandle(), 1, to_char_p_p([f'-arch={self.arch}'.encode()])), handle)
jitlink_check(nvrtc.nvJitLinkAddData(handle, nvrtc.NVJITLINK_INPUT_PTX, ptxsrc:=super().compile(src), len(ptxsrc), "<null>".encode()), handle)
jitlink_check(nvrtc.nvJitLinkComplete(handle), handle)
data = _get_bytes(handle, nvrtc.nvJitLinkGetLinkedCubin, nvrtc.nvJitLinkGetLinkedCubinSize, jitlink_check)
jitlink_check(nvrtc.nvJitLinkDestroy(handle))
jitlink_check(jitlink.nvJitLinkCreate(handle := jitlink.nvJitLinkHandle(), 1, to_char_p_p([f'-arch={self.arch}'.encode()])), handle)
jitlink_check(jitlink.nvJitLinkAddData(handle, jitlink.NVJITLINK_INPUT_PTX, ptxsrc:=super().compile(src), len(ptxsrc), "<null>".encode()), handle)
jitlink_check(jitlink.nvJitLinkComplete(handle), handle)
data = _get_bytes(handle, jitlink.nvJitLinkGetLinkedCubin, jitlink.nvJitLinkGetLinkedCubinSize, jitlink_check)
jitlink_check(jitlink.nvJitLinkDestroy(handle))
return data

View file

@ -1,9 +1,9 @@
import base64, ctypes, pathlib, tempfile, hashlib
from tinygrad.device import Compiler
from tinygrad.helpers import cpu_objdump, system
import tinygrad.runtime.autogen.mesa as mesa
from tinygrad.runtime.autogen import mesa
from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, expect, cerr
try: import tinygrad.runtime.autogen.llvm as llvm
try: from tinygrad.runtime.autogen import llvm
except (ImportError, FileNotFoundError): llvm = None #type:ignore[assignment]
def deserialize(enc_src, opts):

View file

@ -1,5 +1,6 @@
from __future__ import annotations
import resource, ctypes, weakref, functools, itertools, tinygrad.runtime.autogen.ib as ib
import resource, ctypes, weakref, functools, itertools
from tinygrad.runtime.autogen import ib
from typing import Iterator
from dataclasses import dataclass
from weakref import WeakKeyDictionary
@ -141,7 +142,7 @@ class IBConn:
while (wr_id in self.pending_wrids) if wr_id is not None else self.pending_wrids:
if self.ctx.ctx.contents.ops.poll_cq(self.cq, _num_entries:=1, ctypes.byref(wc:=ib.struct_ibv_wc())):
if wc.status != ib.IBV_WC_SUCCESS:
raise RuntimeError(f'Work Request completed with error: wr_id={wc.wr_id} status={ib.ibv_wc_status__enumvalues.get(wc.status, wc.status)}')
raise RuntimeError(f'Work Request completed with error: wr_id={wc.wr_id} status={ib.enum_ibv_wc_status.get(wc.status, wc.status)}')
self.pending_wrids.remove(wc.wr_id)
def rdma_write(self, sgl:list[SGE]):
@ -162,7 +163,7 @@ class IBConn:
# Scatter-Gather Entry for local memory
sge = ctypes.pointer(ib.struct_ibv_sge(addr=sg.src_iova+off, length=min(sg.size-off, self.ctx.port_attr.max_msg_sz), lkey=sg.src_key))
# RDMA struct for remote memory
wr = ib.union_ibv_send_wr_wr(rdma=ib.struct_ibv_send_wr_1_rdma(remote_addr=sg.dst_iova+off, rkey=sg.dst_key))
wr = ib.struct_ibv_send_wr_wr(rdma=ib.struct_ibv_send_wr_wr_rdma(remote_addr=sg.dst_iova+off, rkey=sg.dst_key))
# Signal (with chosen work request id) if it's the last wr (first in the loop since it's reversed)
wid, flags = (wr_id, ib.IBV_SEND_SIGNALED) if swr is None else (0, 0)
# Create Send Request

View file

@ -1,11 +1,10 @@
from __future__ import annotations
import ctypes, time, array, struct, itertools, dataclasses
from typing import cast, Any
from tinygrad.runtime.autogen.nv import nv
from tinygrad.runtime.autogen import nv, nv_gpu, pci
from tinygrad.helpers import to_mv, lo32, hi32, DEBUG, round_up, round_down, mv_address, fetch, wait_cond
from tinygrad.runtime.support.system import System
from tinygrad.runtime.support.elf import elf_loader
from tinygrad.runtime.autogen import nv_gpu, pci
@dataclasses.dataclass(frozen=True)
class GRBufDesc: size:int; virt:bool; phys:bool; local:bool=False # noqa: E702
@ -69,8 +68,8 @@ class NVRpcQueue:
System.memory_barrier()
if DEBUG >= 3:
rpc_names = {**nv.c__Ea_NV_VGPU_MSG_FUNCTION_NOP__enumvalues, **nv.c__Ea_NV_VGPU_MSG_EVENT_FIRST_EVENT__enumvalues}
print(f"nv {self.gsp.nvdev.devfmt}: in RPC: {rpc_names.get(hdr.function, f'ev:{hdr.function:x}')}, res:{hdr.rpc_result:#x}")
nm = nv.rpc_fns.get(hdr.function, nv.rpc_events.get(hdr.function, f'ev:{hdr.function:x}'))
print(f"nv {self.gsp.nvdev.devfmt}: in RPC: {nm}, res:{hdr.rpc_result:#x}")
if hdr.rpc_result != 0: raise RuntimeError(f"RPC call {hdr.function} failed with result {hdr.rpc_result}")
if hdr.function == cmd: return msg
@ -443,7 +442,7 @@ class NV_GSP(NV_IP):
bufs_p = nv_gpu.struct_NV90F1_CTRL_VASPACE_COPY_SERVER_RESERVED_PDES_PARAMS(pageSize=res_sz, numLevelsToCopy=3,
virtAddrLo=res_va, virtAddrHi=res_va + res_sz - 1)
for i,pt in enumerate(self.nvdev.mm.page_tables(res_va, size=res_sz)):
bufs_p.levels[i] = nv_gpu.struct_NV90F1_CTRL_VASPACE_COPY_SERVER_RESERVED_PDES_PARAMS_0(physAddress=pt.paddr,
bufs_p.levels[i] = nv_gpu.struct_NV90F1_CTRL_VASPACE_COPY_SERVER_RESERVED_PDES_PARAMS_level(physAddress=pt.paddr,
size=self.nvdev.mm.pte_cnt[0] * 8 if i == 0 else 0x1000, pageShift=self.nvdev.mm.pte_covers[i].bit_length() - 1, aperture=1)
self.rpc_rm_control(hObject=vaspace, cmd=nv_gpu.NV90F1_CTRL_CMD_VASPACE_COPY_SERVER_RESERVED_PDES, params=bufs_p)