mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
Support CLANG backend on Windows (#8768)
* Support CLANG on Windows
* Put both backends in a windows ci
* remove coff loader
* use memmove

---------

Co-authored-by: b1tg <b1tg@users.noreply.github.com>
Co-authored-by: George Hotz <72895+geohot@users.noreply.github.com>
This commit is contained in:
parent
af0452f116
commit
5d62aa28dc
5 changed files with 58 additions and 37 deletions
41
.github/workflows/test.yml
vendored
41
.github/workflows/test.yml
vendored
|
|
@ -622,10 +622,7 @@ jobs:
|
|||
wintests:
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
backend: [llvm]
|
||||
|
||||
name: Tests on Windows (${{ matrix.backend }})
|
||||
name: Tests on Windows (llvm+clang)
|
||||
runs-on: windows-latest
|
||||
timeout-minutes: 45
|
||||
steps:
|
||||
|
|
@ -642,20 +639,34 @@ jobs:
|
|||
with:
|
||||
path: ${{ env.Python3_ROOT_DIR }}\Lib\site-packages
|
||||
key: windows-${{ matrix.backend }}-packages-${{ hashFiles('**/setup.py') }}
|
||||
- name: Set env
|
||||
shell: bash
|
||||
run: |
|
||||
if [ "${{ matrix.backend }}" = "clang" ]; then
|
||||
echo "CLANG=1" >> $GITHUB_ENV
|
||||
elif [ "${{ matrix.backend }}" = "llvm" ]; then
|
||||
echo "LLVM=1" >> $GITHUB_ENV
|
||||
fi
|
||||
- name: Install dependencies
|
||||
run: pip install --user -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Check Device.DEFAULT and print some source
|
||||
env:
|
||||
DEBUG: 5
|
||||
LLVM: 1
|
||||
PYTHONPATH: ${{ github.workspace }}
|
||||
- name: Check Device.DEFAULT and print some source (llvm)
|
||||
shell: bash
|
||||
run: |
|
||||
python3 test/test_ops.py TestOps.test_add
|
||||
- name: Run pytest
|
||||
env:
|
||||
DEBUG: 5
|
||||
LLVM: 1
|
||||
run: python -m pytest -n=auto test/test_tiny.py --durations=20
|
||||
PYTHONPATH=${{ github.workspace }} LLVM=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'LLVM', Device.DEFAULT"
|
||||
DEBUG=5 PYTHONPATH=${{ github.workspace }} LLVM=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
|
||||
- name: Check Device.DEFAULT and print some source (clang)
|
||||
shell: bash
|
||||
run: |
|
||||
PYTHONPATH=${{ github.workspace }} CLANG=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == 'CLANG', Device.DEFAULT"
|
||||
DEBUG=5 PYTHONPATH=${{ github.workspace }} CLANG=1 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
|
||||
- name: Run pytest (llvm)
|
||||
shell: bash
|
||||
run: |
|
||||
DEBUG=5 LLVM=1 python -m pytest -n=auto test/test_tiny.py --durations=20
|
||||
- name: Run pytest (clang)
|
||||
shell: bash
|
||||
run: |
|
||||
DEBUG=5 CLANG=1 python -m pytest -n=auto test/test_tiny.py --durations=20
|
||||
|
||||
#testunicorn:
|
||||
# name: ARM64 unicorn Test
|
||||
|
|
|
|||
|
|
@ -222,23 +222,31 @@ MAP_JIT = 0x0800
|
|||
class CPUProgram:
|
||||
helper_handle = ctypes.CDLL(ctypes.util.find_library('System' if OSX else 'kernel32' if sys.platform == "win32" else 'gcc_s'))
|
||||
def __init__(self, name:str, lib:bytes):
|
||||
assert sys.platform != "win32", "clang is not supported for windows yet"
|
||||
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
|
||||
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
|
||||
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
|
||||
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
|
||||
if sys.platform == "win32":
|
||||
PAGE_EXECUTE_READWRITE = 0x40
|
||||
MEM_COMMIT = 0x1000
|
||||
MEM_RESERVE = 0x2000
|
||||
ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_uint64
|
||||
ptr = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_int(0), ctypes.c_int(len(lib)), MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE)
|
||||
ctypes.memmove(ptr, lib, len(lib))
|
||||
self.fxn = ctypes.CFUNCTYPE(None)(ptr)
|
||||
else:
|
||||
from mmap import mmap, PROT_READ, PROT_WRITE, PROT_EXEC, MAP_ANON, MAP_PRIVATE
|
||||
# On apple silicon with SPRR enabled (it always is in macos) RWX pages are unrepresentable: https://blog.svenpeter.dev/posts/m1_sprr_gxf/
|
||||
# MAP_JIT allows us to easily flip pages from RW- to R-X and vice versa. It is a noop on intel cpus. (man pthread_jit_write_protect_np)
|
||||
self.mem = mmap(-1, len(lib), MAP_ANON | MAP_PRIVATE | (MAP_JIT if OSX else 0), PROT_READ | PROT_WRITE | PROT_EXEC)
|
||||
|
||||
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
|
||||
self.mem.write(lib)
|
||||
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
|
||||
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(False)
|
||||
self.mem.write(lib)
|
||||
if OSX: CPUProgram.helper_handle.pthread_jit_write_protect_np(True)
|
||||
|
||||
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
|
||||
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
|
||||
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
|
||||
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
|
||||
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
|
||||
# __clear_cache isn't a normal libc function, but a compiler support routine found in libgcc_s for gcc and compiler-rt for clang.
|
||||
# libgcc_s comes as shared library but compiler-rt is only a bunch of static library archives which we can't directly load, but fortunately
|
||||
# it somehow found its way into libSystem on macos (likely because it used __builtin_clear_cache) and libgcc_s is ~always present on linux
|
||||
# Using ["name"] instead of .name because otherwise name is getting mangled: https://docs.python.org/3.12/reference/expressions.html#index-5
|
||||
CPUProgram.helper_handle["__clear_cache"](ctypes.c_void_p(mv_address(self.mem)), ctypes.c_void_p(mv_address(self.mem) + len(lib)))
|
||||
|
||||
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
|
||||
self.fxn = ctypes.CFUNCTYPE(None)(mv_address(self.mem))
|
||||
|
||||
def __call__(self, *bufs, vals=(), wait=False):
|
||||
args = list(bufs) + list(vals)
|
||||
|
|
|
|||
|
|
@ -272,7 +272,7 @@ def cpu_objdump(lib, objdump_tool='objdump'):
|
|||
def capstone_flatdump(lib: bytes):
|
||||
import capstone
|
||||
match platform.machine():
|
||||
case 'x86_64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
|
||||
case 'x86_64' | 'AMD64': cs = capstone.Cs(capstone.CS_ARCH_X86, capstone.CS_MODE_64)
|
||||
case 'aarch64' | 'arm64': cs = capstone.Cs(capstone.CS_ARCH_ARM64, capstone.CS_MODE_ARM)
|
||||
case machine: raise NotImplementedError(f"Capstone disassembly isn't supported for {machine}")
|
||||
for instr in cs.disasm(lib, 0):
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
from typing import Optional, Union, Literal, Callable, cast
|
||||
import os, math
|
||||
import os, math, sys
|
||||
from collections import defaultdict, Counter
|
||||
from tinygrad.ops import GroupOp, Ops, UOp, PatternMatcher, UPat
|
||||
from tinygrad.helpers import strip_parens, getenv, prod, dedup, AMX
|
||||
|
|
@ -178,7 +178,8 @@ class ClangRenderer(CStyleLanguage):
|
|||
tensor_cores = [TensorCore(dims=(sz,sz,1), threads=1, elements_per_thread=(sz,sz,sz*sz), dtype_in=dt, dtype_out=dt,
|
||||
swizzle=(None, ((),(4,5,6,7,0,1,2,3))), opts=("u0","u0","u0","u0","u1","u1","u1","u1"))
|
||||
for dt,sz in [(dt, 64 // dt.itemsize) for dt in [dtypes.float]]]
|
||||
|
||||
if sys.platform == 'win32':
|
||||
kernel_prefix = "__attribute__((ms_abi)) "
|
||||
def render_vector_prefix(self, dt:DType) -> str:
|
||||
return f"typedef {self.render_dtype(dt.scalar())} {self.render_dtype(dt)} __attribute__((aligned({(sz:=dt.itemsize)}),vector_size({sz})));"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import platform, tempfile, pathlib, subprocess
|
||||
import platform, tempfile, pathlib, subprocess, sys
|
||||
from tinygrad.helpers import cpu_objdump, capstone_flatdump
|
||||
from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
|
||||
from tinygrad.runtime.support.elf import jit_loader
|
||||
|
|
@ -26,7 +26,8 @@ class ClangJITCompiler(Compiler):
|
|||
def compile(self, src:str) -> bytes:
|
||||
# -fno-math-errno is required for __builtin_sqrt to become an instruction instead of a function call
|
||||
# x18 is a reserved platform register. It is clobbered on context switch in macos and is used to store TEB pointer in windows on arm, don't use it
|
||||
args = ['-march=native', f'--target={platform.machine()}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
|
||||
target = 'x86_64' if sys.platform == 'win32' else platform.machine()
|
||||
args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
|
||||
arch_args = ['-ffixed-x18'] if platform.machine() == 'arm64' else []
|
||||
obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
|
||||
return jit_loader(obj)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue