mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
refactor with ci
This commit is contained in:
parent
b44463aa66
commit
8e68014299
3 changed files with 141 additions and 121 deletions
231
.github/workflows/test.yml
vendored
231
.github/workflows/test.yml
vendored
|
|
@ -12,33 +12,48 @@ jobs:
|
|||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
- name: Cache pip
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: linting
|
||||
- name: Install dependencies
|
||||
run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Repo line count
|
||||
run: python3 sz.py
|
||||
- name: Lint with pylint
|
||||
run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string=' ' **/*.py
|
||||
- name: Lint with flake8
|
||||
run: flake8 --statistics -j4
|
||||
- name: Lint tinygrad with pylint
|
||||
run: pylint tinygrad/
|
||||
- name: Run mypy
|
||||
run: mypy tinygrad/ --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
|
||||
- name: Install SLOCCount
|
||||
run: sudo apt install sloccount
|
||||
- name: Check <5000 lines
|
||||
run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
- name: Cache pip
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: linting
|
||||
- name: Install dependencies
|
||||
run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Repo line count
|
||||
run: python3 sz.py
|
||||
- name: Lint with pylint
|
||||
run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string=' ' **/*.py
|
||||
- name: Lint with flake8
|
||||
run: flake8 --statistics -j4
|
||||
- name: Lint tinygrad with pylint
|
||||
run: pylint tinygrad/
|
||||
- name: Run mypy
|
||||
run: mypy tinygrad/ --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
|
||||
- name: Install SLOCCount
|
||||
run: sudo apt install sloccount
|
||||
- name: Check <5000 lines
|
||||
run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
|
||||
|
||||
testarm64:
|
||||
name: ARM64 Test
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Setup qemu
|
||||
uses: docker/setup-qemu-action@v1
|
||||
with:
|
||||
platforms: all
|
||||
- name: Install dependencies
|
||||
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Test Ops
|
||||
run: ARM64=1 CLANG=1 python -m pytest -n=auto test/test_ops.py
|
||||
|
||||
testcpuimagenet:
|
||||
name: CPU and ImageNet to C Tests
|
||||
|
|
@ -46,35 +61,35 @@ jobs:
|
|||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
- name: Cache pip
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: testing
|
||||
- name: Install Dependencies
|
||||
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Test Docs
|
||||
run: python docs/abstractions.py
|
||||
- name: Test Quickstart
|
||||
run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python3 quickstart.py
|
||||
- name: Run Pytest
|
||||
run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)"
|
||||
- name: Fuzz Test symbolic
|
||||
run: python test/external/fuzz_symbolic.py
|
||||
- name: Fuzz Test shapetracker
|
||||
run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
|
||||
- name: Compile EfficientNet to C
|
||||
run: PYTHONPATH="." CLANG=1 python3 examples/compile_efficientnet.py > recognize.c
|
||||
- name: Compile C to native
|
||||
run: clang -O2 recognize.c -lm -o recognize
|
||||
- name: Test EfficientNet
|
||||
run: curl https://media.istockphoto.com/photos/hen-picture-id831791190 | ./recognize | grep hen
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
- name: Cache pip
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: testing
|
||||
- name: Install Dependencies
|
||||
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Test Docs
|
||||
run: python docs/abstractions.py
|
||||
- name: Test Quickstart
|
||||
run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python3 quickstart.py
|
||||
- name: Run Pytest
|
||||
run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)"
|
||||
- name: Fuzz Test symbolic
|
||||
run: python test/external/fuzz_symbolic.py
|
||||
- name: Fuzz Test shapetracker
|
||||
run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
|
||||
- name: Compile EfficientNet to C
|
||||
run: PYTHONPATH="." CLANG=1 python3 examples/compile_efficientnet.py > recognize.c
|
||||
- name: Compile C to native
|
||||
run: clang -O2 recognize.c -lm -o recognize
|
||||
- name: Test EfficientNet
|
||||
run: curl https://media.istockphoto.com/photos/hen-picture-id831791190 | ./recognize | grep hen
|
||||
|
||||
testtorch:
|
||||
name: Torch Tests
|
||||
|
|
@ -82,23 +97,23 @@ jobs:
|
|||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
- name: Cache pip
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: testing
|
||||
- name: Install Dependencies
|
||||
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Run Pytest
|
||||
run: TORCH=1 python -m pytest -n=auto test/
|
||||
- name: Run ONNX
|
||||
run: TORCH=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --tb=no --disable-warnings || true
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.8
|
||||
- name: Cache pip
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/.cache/pip
|
||||
key: testing
|
||||
- name: Install Dependencies
|
||||
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Run Pytest
|
||||
run: TORCH=1 python -m pytest -n=auto test/
|
||||
- name: Run ONNX
|
||||
run: TORCH=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --tb=no --disable-warnings || true
|
||||
|
||||
testopencl:
|
||||
strategy:
|
||||
|
|
@ -158,33 +173,33 @@ jobs:
|
|||
timeout-minutes: 20
|
||||
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python 3.11
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.11
|
||||
- name: Cache pip
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/Library/Caches/pip
|
||||
key: metalwebgpu
|
||||
- name: Install Dependencies
|
||||
run: pip install -e '.[metal,webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Test LLaMA compile speed
|
||||
run: PYTHONPATH="." METAL=1 python3 test/external/external_test_speed_llama.py
|
||||
#- name: Run dtype test
|
||||
# run: DEBUG=4 METAL=1 python -m pytest test/test_dtype.py
|
||||
# dtype test has issues on test_half_to_int8
|
||||
- name: Run metal ops test
|
||||
run: DEBUG=2 METAL=1 python -m pytest test/test_ops.py
|
||||
- name: Run JIT test
|
||||
run: DEBUG=2 METAL=1 python -m pytest test/test_jit.py
|
||||
# TODO: why not testing the whole test/?
|
||||
- name: Run webgpu pytest
|
||||
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto -m 'webgpu'
|
||||
- name: Build WEBGPU Efficientnet
|
||||
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.webgpu.compile_webgpu
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Set up Python 3.11
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: 3.11
|
||||
- name: Cache pip
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: ~/Library/Caches/pip
|
||||
key: metalwebgpu
|
||||
- name: Install Dependencies
|
||||
run: pip install -e '.[metal,webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
|
||||
- name: Test LLaMA compile speed
|
||||
run: PYTHONPATH="." METAL=1 python3 test/external/external_test_speed_llama.py
|
||||
#- name: Run dtype test
|
||||
# run: DEBUG=4 METAL=1 python -m pytest test/test_dtype.py
|
||||
# dtype test has issues on test_half_to_int8
|
||||
- name: Run metal ops test
|
||||
run: DEBUG=2 METAL=1 python -m pytest test/test_ops.py
|
||||
- name: Run JIT test
|
||||
run: DEBUG=2 METAL=1 python -m pytest test/test_jit.py
|
||||
# TODO: why not testing the whole test/?
|
||||
- name: Run webgpu pytest
|
||||
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto -m 'webgpu'
|
||||
- name: Build WEBGPU Efficientnet
|
||||
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.webgpu.compile_webgpu
|
||||
|
||||
testdocker:
|
||||
name: Docker Test
|
||||
|
|
@ -192,12 +207,12 @@ jobs:
|
|||
if: ${{ false }}
|
||||
|
||||
steps:
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Build Docker
|
||||
run: docker build -t tinygrad -f test/Dockerfile .
|
||||
- name: Test Docker
|
||||
run: docker run --rm tinygrad /usr/bin/env python3 -c "from tinygrad.tensor import Tensor; print(Tensor.eye(3).numpy())"
|
||||
- name: Checkout Code
|
||||
uses: actions/checkout@v3
|
||||
- name: Build Docker
|
||||
run: docker build -t tinygrad -f test/Dockerfile .
|
||||
- name: Test Docker
|
||||
run: docker run --rm tinygrad /usr/bin/env python3 -c "from tinygrad.tensor import Tensor; print(Tensor.eye(3).numpy())"
|
||||
|
||||
tests:
|
||||
strategy:
|
||||
|
|
@ -218,7 +233,7 @@ jobs:
|
|||
- name: Cache pip
|
||||
uses: actions/cache@v3
|
||||
with:
|
||||
path: '~/.cache/pip'
|
||||
path: "~/.cache/pip"
|
||||
key: ${{ matrix.backend }}
|
||||
- name: Set env
|
||||
run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLED_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n'}}" >> $GITHUB_ENV
|
||||
|
|
@ -270,4 +285,4 @@ jobs:
|
|||
run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}'
|
||||
- name: Run pytest (cuda)
|
||||
if: matrix.backend=='cuda'
|
||||
run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
|
||||
run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
|
||||
|
|
|
|||
|
|
@ -38,12 +38,16 @@ class AssemblyCodegen(Linearizer):
|
|||
# s registers are the addresses and non local indexes
|
||||
def codegen(self):
|
||||
self.process()
|
||||
if not getenv("CLANG"): self.hand_coded_optimizations()
|
||||
self.limit_global_dims(3) # all GPU asms have 3 (for now)
|
||||
self.hand_coded_optimizations()
|
||||
if not getenv("CLANG"):
|
||||
self.limit_global_dims(3) # all GPU asms have 3 (for now)
|
||||
self.linearize()
|
||||
|
||||
cnts:DefaultDict[Tuple[DType, bool], int] = defaultdict(int)
|
||||
tor: Dict[Any, Register] = {}
|
||||
buf_to_dtype = {args[0]:args[1] for uop,_,_,args in self.uops if uop == UOps.DEFINE_GLOBAL}
|
||||
buf_index = {x:i for i,x in enumerate(buf_to_dtype.keys())}
|
||||
|
||||
def newreg(tok, dtype=dtypes.float32, scalar=False):
|
||||
nonlocal cnts, tor
|
||||
if isinstance(tok, Token): dtype = tok.dtype # this
|
||||
|
|
@ -82,7 +86,7 @@ class AssemblyCodegen(Linearizer):
|
|||
AndNode: lambda self,ops,ctx: functools.reduce(lambda a,b: render_alu(BinaryOps.MUL, a, b.render(ops,ctx), dtype=dtypes.bool), self.nodes[1:], self.nodes[0].render(ops,ctx)) }
|
||||
|
||||
def addr_w_offset(args):
|
||||
idx = args.idx*args.memory_dtype.itemsize
|
||||
idx = args.idx*buf_to_dtype[args.name].itemsize
|
||||
off = 0 # TODO: should this be None?
|
||||
if isinstance(idx, SumNode):
|
||||
nums = [n.b for n in idx.nodes if isinstance(n, NumNode)]
|
||||
|
|
@ -95,15 +99,15 @@ class AssemblyCodegen(Linearizer):
|
|||
new_reg = newreg((reg.nm, 'vec'), dtype=reg.dtype)
|
||||
ins.append(AssemblyInstruction(UOps.ALU, new_reg, [reg], UnaryOps.NOOP))
|
||||
reg = new_reg
|
||||
return tor[f"buf{args.i}"], reg, off
|
||||
reg = render_alu(BinaryOps.ADD, render_cast(reg, dtypes.uint64), tor[args[0]], dtype=dtypes.uint64)
|
||||
return tor[f"buf{buf_index[args.name]}"], reg, off
|
||||
reg = render_alu(BinaryOps.ADD, render_cast(reg, dtypes.uint64), tor[f"buf{buf_index[args.name]}"], dtype=dtypes.uint64)
|
||||
return reg, None, off
|
||||
|
||||
buf_to_dtype = {args[0]:args[1] for uop,_,_,args in self.uops if uop == UOps.DEFINE_GLOBAL}
|
||||
buf_index = {x:i for i,x in enumerate(buf_to_dtype.keys())}
|
||||
|
||||
ins = []
|
||||
ins += [AssemblyInstruction(UOps.DEFINE_GLOBAL, newreg(args[0], dtype=dtypes.uint64, scalar=True), [], args[0]) for uop,_,_,args in self.uops if uop == UOps.DEFINE_GLOBAL]
|
||||
ins += [AssemblyInstruction(UOps.SPECIAL, newreg(f"buf{i}", dtype=dtypes.uint64, scalar=True), [], f"buf{i}") for i in range(len(self.bufs))]
|
||||
global_size, local_size = [], []
|
||||
skipload_branch = 0
|
||||
for uop,newvar,vin,args in self.uops:
|
||||
|
|
@ -166,13 +170,13 @@ class AssemblyCodegen(Linearizer):
|
|||
ins.append(AssemblyInstruction(UOps.COND_BRANCH, None, [pred], (f"$skipload_{skipload_branch}", False)))
|
||||
if args.valid.max == 1:
|
||||
# NOTE: you can't compute the index in here, because it assumes it's all available later
|
||||
ins.append(AssemblyInstruction(UOps.LOAD, reg, [idx] + ([treg] if treg is not None else []), (off, 'global', args.memory_dtype if buf_to_dtype[args.name] != dtypes.float else None))) #if args.i != -1 else 'shared')
|
||||
ins.append(AssemblyInstruction(UOps.LOAD, reg, [idx] + ([treg] if treg is not None else []), (off, 'global' if buf_index[args.name] != -1 else 'shared', args.memory_dtype if buf_to_dtype[args.name] != dtypes.float else None))) #if args.i != -1 else 'shared')
|
||||
if args.valid.min == 0 and args.valid.max == 1:
|
||||
ins.append(AssemblyInstruction(UOps.LABEL, None, [], f"$skipload_{skipload_branch}"))
|
||||
skipload_branch += 1
|
||||
elif uop == UOps.STORE:
|
||||
idx, treg, off = addr_w_offset(args)
|
||||
ins.append(AssemblyInstruction(UOps.STORE, None, [idx, tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global', args.memory_dtype if buf_to_dtype['data0'] != dtypes.float else None))) #if args.i != -1 else 'shared')
|
||||
ins.append(AssemblyInstruction(UOps.STORE, None, [idx, tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global' if buf_index[args.name] != -1 else 'shared', args.memory_dtype if buf_to_dtype['data0'] != dtypes.float else None))) #if args.i != -1 else 'shared')
|
||||
|
||||
# define registers
|
||||
ins = [AssemblyInstruction(UOps.DEFINE_REGISTER, None, [], (dtype, type_to_letter(dtype), c)) for dtype,c in cnts.items()] + ins
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ def compute_offsets(total):
|
|||
def get_op(op): return f"bl {'_' if system() == 'Darwin' else ''}{op}"
|
||||
|
||||
class ARM64Codegen(AssemblyCodegen):
|
||||
def hand_coded_optimizations(self): pass
|
||||
def specialize(self, asm):
|
||||
var_size = 0
|
||||
prev_uop = None
|
||||
|
|
@ -28,7 +29,7 @@ class ARM64Codegen(AssemblyCodegen):
|
|||
|
||||
def mov_imm(value, to):
|
||||
# Manually move value into reg if vin[1] can't fit
|
||||
if value > 65535:
|
||||
if abs(value) > abs(65535) and abs(value) != float('inf'):
|
||||
ins.append(f"movz w15, #{value & 0xffff}")
|
||||
ins.append(f"movk w15, #{(value >> 16) & 0xffff}, lsl #16")
|
||||
ins.append(f"sxtw {to}, w15")
|
||||
|
|
@ -75,11 +76,11 @@ class ARM64Codegen(AssemblyCodegen):
|
|||
ins.append(f"mov x15, {mem_vars[v.nm]}")
|
||||
ins.append(f"ldr {rtor[v.nm]}, [sp, x15]")
|
||||
|
||||
if uop == UOps.DEFINE_GLOBAL:
|
||||
if arg.startswith('data'):
|
||||
if uop == UOps.SPECIAL:
|
||||
if arg.startswith('buf'):
|
||||
# data 8 to n into the stack
|
||||
if int(arg[4:]) >= 8:
|
||||
ins.append(f"ldr x15, [x19, #{(int(arg[4:]) - 8) * 8}]")
|
||||
if int(arg[3:]) >= 8:
|
||||
ins.append(f"ldr x15, [x19, #{(int(arg[3:]) - 8) * 8}]")
|
||||
ins.append(f"mov {rtor[out.nm]}, x15")
|
||||
elif uop == UOps.CAST:
|
||||
if arg == BinaryOps.CMPEQ:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue