Support CLANG backend on Windows (#8768)

* Support CLANG on Windows

* Put both backends in a Windows CI

* Remove COFF loader

* use memmove

---------

Co-authored-by: b1tg <b1tg@users.noreply.github.com>
Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
b1tg 2025-01-28 17:19:34 +08:00 committed by GitHub
commit 5d62aa28dc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 58 additions and 37 deletions

View file

@ -622,10 +622,7 @@ jobs:
wintests:
strategy:
fail-fast: false
matrix:
backend: [llvm]
name: Tests on Windows (${{ matrix.backend }})
name: Tests on Windows (llvm+clang)
runs-on: windows-latest
timeout-minutes: 45
steps:
@ -642,20 +639,34 @@ jobs:
with:
path: ${{ env.Python3_ROOT_DIR }}\Lib\site-packages
key: windows-${{ matrix.backend }}-packages-${{ hashFiles('**/setup.py') }}
# Exports CLANG=1 or LLVM=1 to the following steps via $GITHUB_ENV, depending on matrix.backend.
# NOTE(review): the job header in this same diff appears to drop the `matrix:` block (the job is now named "llvm+clang")
# and the later steps set LLVM=1/CLANG=1 inline; if so, matrix.backend expands to an empty string, neither branch
# runs, and this step (like the cache key above that interpolates matrix.backend) may be a leftover — confirm.
- name: Set env
shell: bash
run: |
if [ "${{ matrix.backend }}" = "clang" ]; then
echo "CLANG=1" >> $GITHUB_ENV
elif [ "${{ matrix.backend }}" = "llvm" ]; then
echo "LLVM=1" >> $GITHUB_ENV
fi
- name: Install dependencies
run: pip install --user -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Check Device.DEFAULT and print some source
env:
DEBUG: 5
LLVM: 1
PYTHONPATH: ${{ github.workspace }}
- name: Check Device.DEFAULT and print some source (llvm)
shell: bash
run: |
python3 test/test_ops.py TestOps.test_add
- name: Run pytest
env:
DEBUG: 5
LLVM: 1
run: python -m pytest -n=auto test/test_tiny.py --durations=20
PYTHONPATH=${{ github.workspace }} LLVM=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'LLVM', Device.DEFAULT"
DEBUG=5 PYTHONPATH=${{ github.workspace }} LLVM=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
# Sanity check for the clang backend: assert that CLANG=1 makes Device.DEFAULT resolve to 'CLANG',
# then run the single TestOps.test_add test with DEBUG=5 and FORWARD_ONLY=1 (per the step name, to print some source).
- name: Check Device.DEFAULT and print some source (clang)
shell: bash
run: |
PYTHONPATH=${{ github.workspace }} CLANG=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'CLANG', Device.DEFAULT"
DEBUG=5 PYTHONPATH=${{ github.workspace }} CLANG=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
# Runs test/test_tiny.py under pytest with LLVM=1 and DEBUG=5 set inline for this command only
# (-n=auto picks the worker count automatically; --durations=20 reports the 20 slowest tests).
- name: Run pytest (llvm)
shell: bash
run: |
DEBUG=5 LLVM=1 python -m pytest -n=auto test/test_tiny.py --durations=20
# Same test run as the llvm step, but with CLANG=1 so test/test_tiny.py exercises the clang backend.
- name: Run pytest (clang)
shell: bash
run: |
DEBUG=5 CLANG=1 python -m pytest -n=auto test/test_tiny.py --durations=20
#testunicorn:
# name: ARM64 unicorn Test

View file

@ -222,23 +222,31 @@ MAP_JIT = 0x0800
class CPUProgram:
helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32' if sys.platform == "win32" else 'gcc_s'))
def __init__(self, name:str, lib:bytes):
# Copies the machine code in `lib` into executable memory and keeps a ctypes function pointer to its first byte in self.fxn.
# `name` is not used in the lines visible here.
# NOTE(review): this span is a rendered diff without +/- markers, so the pre-change and post-change versions of several
# lines (the mmap setup, the pthread_jit_write_protect_np calls, the __clear_cache call, the final self.fxn) appear twice.
assert sys.platform != "win32", "clang is not supported for windows yet"
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
# Windows path: reserve and commit a read/write/execute region with VirtualAlloc, copy `lib` into it and call it in place.
if sys.platform == "win32":
PAGE_EXECUTE_READWRITE = 0x40
MEM_COMMIT = 0x1000
MEM_RESERVE = 0x2000
# ctypes assumes a C int return type by default; widen it so a 64-bit address is not truncated.
ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_uint64
# NOTE(review): VirtualAlloc's documented signature is (LPVOID, SIZE_T, DWORD, DWORD) and it returns NULL on failure;
# here the address and size are passed as c_int and the result is used unchecked — consider c_void_p/c_size_t and a NULL check.
ptr = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_int(0), ctypes.c_int(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
ctypes.memmove(ptr, lib, len(lib))
self.fxn = ctypes.CFUNCTYPE(None)(ptr)
# NOTE(review): unlike the branch below, this path does not flush the instruction cache and does not set self.mem;
# no matching VirtualFree is visible in this span — confirm both are intentional.
else:
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
self.mem.write(lib)
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
def __call__(self, *bufs, vals=(), wait=False):
args = list(bufs) + list(vals)

View file

@ -272,7 +272,7 @@ def cpu_objdump(lib, objdump_tool='objdump'):
def capstone_flatdump(lib: bytes):
import capstone
match platform.machine():
case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'x86_64' | 'AMD64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
for instr in cs.disasm(lib, 0):

View file

@ -1,5 +1,5 @@
from typing import Optional, Union, Literal, Callable, cast
import os, math
import os, math, sys
from collections import defaultdict, Counter
from tinygrad.ops import GroupOp, Ops, UOp, PatternMatcher, UPat
from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX
@ -178,7 +178,8 @@ class ClangRenderer(CStyleLanguage):
tensor_cores = [TensorCore(dims=(sz,sz,1), threads=1, elements_per_thread=(sz,sz,sz*sz), dtype_in=dt, dtype_out=dt,
swizzle=(None, ((),(4,5,6,7,0,1,2,3))), opts=("u0","u0","u0","u0","u1","u1","u1","u1"))
for dt,sz in [(dt, 64 // dt.itemsize) for dt in [dtypes.float]]]
# On Windows, set the kernel prefix to the ms_abi function attribute.
# NOTE(review): presumably so kernels built for the ELF target still follow the Microsoft x64 calling convention that the
# ctypes caller uses — confirm against how kernel_prefix is consumed by the base renderer.
if sys.platform == 'win32':
kernel_prefix = "__attribute__((ms_abi)) "
def render_vector_prefix(self, dt:DType) -> str:
# Builds the C typedef for vector dtype `dt`: the element type is render_dtype(dt.scalar()), the new type name is
# render_dtype(dt), and both the aligned and vector_size attributes are set to dt.itemsize (bound once to `sz` via the walrus).
return f"typedef {self.render_dtype(dt.scalar())} {self.render_dtype(dt)} __attribute__((aligned({(sz:=dt.itemsize)}),vector_size({sz})));"

View file

@ -1,4 +1,4 @@
import platform, tempfile, pathlib, subprocess
import platform, tempfile, pathlib, subprocess, sys
from tinygrad.helpers import cpu_objdump, capstone_flatdump
from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
from tinygrad.runtime.support.elf import jit_loader
@ -26,7 +26,8 @@ class ClangJITCompiler(Compiler):
def compile(self, src:str) -> bytes:
# Pipes `src` to clang on stdin ('-x c', '-'), reads the object file back from stdout ('-o', '-') and returns jit_loader(obj).
# -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
# x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
# NOTE(review): the next line appears to be the pre-change `args`; the `target`/`args` pair after it is its replacement in this diff.
args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
# On Windows the target architecture is pinned to x86_64 instead of using platform.machine().
# NOTE(review): presumably because platform.machine() reports 'AMD64' there (capstone_flatdump gains an 'AMD64' case in this
# same diff); this also forces x86_64 on Windows-on-ARM hosts — confirm that is acceptable.
target = 'x86_64' if sys.platform == 'win32' else platform.machine()
args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
# NOTE(review): only the exact string 'arm64' enables -ffixed-x18; hosts reporting 'aarch64' (which capstone_flatdump also
# matches) would not get it — confirm whether that is intended.
arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
return jit_loader(obj)