refactor with ci

This commit is contained in:
Steven Anderson 2023-07-31 22:34:29 -04:00
commit 8e68014299
3 changed files with 141 additions and 121 deletions

View file

@ -12,33 +12,48 @@ jobs:
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Cache pip
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: linting
- name: Install dependencies
run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Repo line count
run: python3 sz.py
- name: Lint with pylint
run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string=' ' **/*.py
- name: Lint with flake8
run: flake8 --statistics -j4
- name: Lint tinygrad with pylint
run: pylint tinygrad/
- name: Run mypy
run: mypy tinygrad/ --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
- name: Install SLOCCount
run: sudo apt install sloccount
- name: Check <5000 lines
run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Cache pip
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: linting
- name: Install dependencies
run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Repo line count
run: python3 sz.py
- name: Lint with pylint
run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string=' ' **/*.py
- name: Lint with flake8
run: flake8 --statistics -j4
- name: Lint tinygrad with pylint
run: pylint tinygrad/
- name: Run mypy
run: mypy tinygrad/ --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
- name: Install SLOCCount
run: sudo apt install sloccount
- name: Check <5000 lines
run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
testarm64:
name: ARM64 Test
runs-on: ubuntu-latest
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Setup qemu
uses: docker/setup-qemu-action@v1
with:
platforms: all
- name: Install dependencies
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Test Ops
run: ARM64=1 CLANG=1 python -m pytest -n=auto test/test_ops.py
testcpuimagenet:
name: CPU and ImageNet to C Tests
@ -46,35 +61,35 @@ jobs:
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Cache pip
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: testing
- name: Install Dependencies
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Test Docs
run: python docs/abstractions.py
- name: Test Quickstart
run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python3 quickstart.py
- name: Run Pytest
run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)"
- name: Fuzz Test symbolic
run: python test/external/fuzz_symbolic.py
- name: Fuzz Test shapetracker
run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
- name: Compile EfficientNet to C
run: PYTHONPATH="." CLANG=1 python3 examples/compile_efficientnet.py > recognize.c
- name: Compile C to native
run: clang -O2 recognize.c -lm -o recognize
- name: Test EfficientNet
run: curl https://media.istockphoto.com/photos/hen-picture-id831791190 | ./recognize | grep hen
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Cache pip
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: testing
- name: Install Dependencies
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Test Docs
run: python docs/abstractions.py
- name: Test Quickstart
run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python3 quickstart.py
- name: Run Pytest
run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)"
- name: Fuzz Test symbolic
run: python test/external/fuzz_symbolic.py
- name: Fuzz Test shapetracker
run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
- name: Compile EfficientNet to C
run: PYTHONPATH="." CLANG=1 python3 examples/compile_efficientnet.py > recognize.c
- name: Compile C to native
run: clang -O2 recognize.c -lm -o recognize
- name: Test EfficientNet
run: curl https://media.istockphoto.com/photos/hen-picture-id831791190 | ./recognize | grep hen
testtorch:
name: Torch Tests
@ -82,23 +97,23 @@ jobs:
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Cache pip
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: testing
- name: Install Dependencies
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Run Pytest
run: TORCH=1 python -m pytest -n=auto test/
- name: Run ONNX
run: TORCH=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --tb=no --disable-warnings || true
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.8
uses: actions/setup-python@v4
with:
python-version: 3.8
- name: Cache pip
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: testing
- name: Install Dependencies
run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Run Pytest
run: TORCH=1 python -m pytest -n=auto test/
- name: Run ONNX
run: TORCH=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --tb=no --disable-warnings || true
testopencl:
strategy:
@ -158,33 +173,33 @@ jobs:
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Cache pip
uses: actions/cache@v3
with:
path: ~/Library/Caches/pip
key: metalwebgpu
- name: Install Dependencies
run: pip install -e '.[metal,webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Test LLaMA compile speed
run: PYTHONPATH="." METAL=1 python3 test/external/external_test_speed_llama.py
#- name: Run dtype test
# run: DEBUG=4 METAL=1 python -m pytest test/test_dtype.py
# dtype test has issues on test_half_to_int8
- name: Run metal ops test
run: DEBUG=2 METAL=1 python -m pytest test/test_ops.py
- name: Run JIT test
run: DEBUG=2 METAL=1 python -m pytest test/test_jit.py
# TODO: why not testing the whole test/?
- name: Run webgpu pytest
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto -m 'webgpu'
- name: Build WEBGPU Efficientnet
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.webgpu.compile_webgpu
- name: Checkout Code
uses: actions/checkout@v3
- name: Set up Python 3.11
uses: actions/setup-python@v4
with:
python-version: 3.11
- name: Cache pip
uses: actions/cache@v3
with:
path: ~/Library/Caches/pip
key: metalwebgpu
- name: Install Dependencies
run: pip install -e '.[metal,webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
- name: Test LLaMA compile speed
run: PYTHONPATH="." METAL=1 python3 test/external/external_test_speed_llama.py
#- name: Run dtype test
# run: DEBUG=4 METAL=1 python -m pytest test/test_dtype.py
# dtype test has issues on test_half_to_int8
- name: Run metal ops test
run: DEBUG=2 METAL=1 python -m pytest test/test_ops.py
- name: Run JIT test
run: DEBUG=2 METAL=1 python -m pytest test/test_jit.py
# TODO: why not testing the whole test/?
- name: Run webgpu pytest
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto -m 'webgpu'
- name: Build WEBGPU Efficientnet
run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.webgpu.compile_webgpu
testdocker:
name: Docker Test
@ -192,12 +207,12 @@ jobs:
if: ${{ false }}
steps:
- name: Checkout Code
uses: actions/checkout@v3
- name: Build Docker
run: docker build -t tinygrad -f test/Dockerfile .
- name: Test Docker
run: docker run --rm tinygrad /usr/bin/env python3 -c "from tinygrad.tensor import Tensor; print(Tensor.eye(3).numpy())"
- name: Checkout Code
uses: actions/checkout@v3
- name: Build Docker
run: docker build -t tinygrad -f test/Dockerfile .
- name: Test Docker
run: docker run --rm tinygrad /usr/bin/env python3 -c "from tinygrad.tensor import Tensor; print(Tensor.eye(3).numpy())"
tests:
strategy:
@ -218,7 +233,7 @@ jobs:
- name: Cache pip
uses: actions/cache@v3
with:
path: '~/.cache/pip'
path: "~/.cache/pip"
key: ${{ matrix.backend }}
- name: Set env
run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLED_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n'}}" >> $GITHUB_ENV
@ -270,4 +285,4 @@ jobs:
run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}'
- name: Run pytest (cuda)
if: matrix.backend=='cuda'
run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models

View file

@ -38,12 +38,16 @@ class AssemblyCodegen(Linearizer):
# s registers are the addresses and non local indexes
def codegen(self):
self.process()
if not getenv("CLANG"): self.hand_coded_optimizations()
self.limit_global_dims(3) # all GPU asms have 3 (for now)
self.hand_coded_optimizations()
if not getenv("CLANG"):
self.limit_global_dims(3) # all GPU asms have 3 (for now)
self.linearize()
cnts:DefaultDict[Tuple[DType, bool], int] = defaultdict(int)
tor: Dict[Any, Register] = {}
buf_to_dtype = {args[0]:args[1] for uop,_,_,args in self.uops if uop == UOps.DEFINE_GLOBAL}
buf_index = {x:i for i,x in enumerate(buf_to_dtype.keys())}
def newreg(tok, dtype=dtypes.float32, scalar=False):
nonlocal cnts, tor
if isinstance(tok, Token): dtype = tok.dtype # this
@ -82,7 +86,7 @@ class AssemblyCodegen(Linearizer):
AndNode: lambda self,ops,ctx: functools.reduce(lambda a,b: render_alu(BinaryOps.MUL, a, b.render(ops,ctx), dtype=dtypes.bool), self.nodes[1:], self.nodes[0].render(ops,ctx)) }
def addr_w_offset(args):
idx = args.idx*args.memory_dtype.itemsize
idx = args.idx*buf_to_dtype[args.name].itemsize
off = 0 # TODO: should this be None?
if isinstance(idx, SumNode):
nums = [n.b for n in idx.nodes if isinstance(n, NumNode)]
@ -95,15 +99,15 @@ class AssemblyCodegen(Linearizer):
new_reg = newreg((reg.nm, 'vec'), dtype=reg.dtype)
ins.append(AssemblyInstruction(UOps.ALU, new_reg, [reg], UnaryOps.NOOP))
reg = new_reg
return tor[f"buf{args.i}"], reg, off
reg = render_alu(BinaryOps.ADD, render_cast(reg, dtypes.uint64), tor[args[0]], dtype=dtypes.uint64)
return tor[f"buf{buf_index[args.name]}"], reg, off
reg = render_alu(BinaryOps.ADD, render_cast(reg, dtypes.uint64), tor[f"buf{buf_index[args.name]}"], dtype=dtypes.uint64)
return reg, None, off
buf_to_dtype = {args[0]:args[1] for uop,_,_,args in self.uops if uop == UOps.DEFINE_GLOBAL}
buf_index = {x:i for i,x in enumerate(buf_to_dtype.keys())}
ins = []
ins += [AssemblyInstruction(UOps.DEFINE_GLOBAL, newreg(args[0], dtype=dtypes.uint64, scalar=True), [], args[0]) for uop,_,_,args in self.uops if uop == UOps.DEFINE_GLOBAL]
ins += [AssemblyInstruction(UOps.SPECIAL, newreg(f"buf{i}", dtype=dtypes.uint64, scalar=True), [], f"buf{i}") for i in range(len(self.bufs))]
global_size, local_size = [], []
skipload_branch = 0
for uop,newvar,vin,args in self.uops:
@ -166,13 +170,13 @@ class AssemblyCodegen(Linearizer):
ins.append(AssemblyInstruction(UOps.COND_BRANCH, None, [pred], (f"$skipload_{skipload_branch}", False)))
if args.valid.max == 1:
# NOTE: you can't compute the index in here, because it assumes it's all available later
ins.append(AssemblyInstruction(UOps.LOAD, reg, [idx] + ([treg] if treg is not None else []), (off, 'global', args.memory_dtype if buf_to_dtype[args.name] != dtypes.float else None))) #if args.i != -1 else 'shared')
ins.append(AssemblyInstruction(UOps.LOAD, reg, [idx] + ([treg] if treg is not None else []), (off, 'global' if buf_index[args.name] != -1 else 'shared', args.memory_dtype if buf_to_dtype[args.name] != dtypes.float else None))) #if args.i != -1 else 'shared')
if args.valid.min == 0 and args.valid.max == 1:
ins.append(AssemblyInstruction(UOps.LABEL, None, [], f"$skipload_{skipload_branch}"))
skipload_branch += 1
elif uop == UOps.STORE:
idx, treg, off = addr_w_offset(args)
ins.append(AssemblyInstruction(UOps.STORE, None, [idx, tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global', args.memory_dtype if buf_to_dtype['data0'] != dtypes.float else None))) #if args.i != -1 else 'shared')
ins.append(AssemblyInstruction(UOps.STORE, None, [idx, tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global' if buf_index[args.name] != -1 else 'shared', args.memory_dtype if buf_to_dtype['data0'] != dtypes.float else None))) #if args.i != -1 else 'shared')
# define registers
ins = [AssemblyInstruction(UOps.DEFINE_REGISTER, None, [], (dtype, type_to_letter(dtype), c)) for dtype,c in cnts.items()] + ins

View file

@ -14,6 +14,7 @@ def compute_offsets(total):
def get_op(op): return f"bl {'_' if system() == 'Darwin' else ''}{op}"
class ARM64Codegen(AssemblyCodegen):
def hand_coded_optimizations(self): pass
def specialize(self, asm):
var_size = 0
prev_uop = None
@ -28,7 +29,7 @@ class ARM64Codegen(AssemblyCodegen):
def mov_imm(value, to):
# Manually move value into reg if vin[1] can't fit
if value > 65535:
if abs(value) > abs(65535) and abs(value) != float('inf'):
ins.append(f"movz w15, #{value & 0xffff}")
ins.append(f"movk w15, #{(value >> 16) & 0xffff}, lsl #16")
ins.append(f"sxtw {to}, w15")
@ -75,11 +76,11 @@ class ARM64Codegen(AssemblyCodegen):
ins.append(f"mov x15, {mem_vars[v.nm]}")
ins.append(f"ldr {rtor[v.nm]}, [sp, x15]")
if uop == UOps.DEFINE_GLOBAL:
if arg.startswith('data'):
if uop == UOps.SPECIAL:
if arg.startswith('buf'):
# data 8 to n into the stack
if int(arg[4:]) >= 8:
ins.append(f"ldr x15, [x19, #{(int(arg[4:]) - 8) * 8}]")
if int(arg[3:]) >= 8:
ins.append(f"ldr x15, [x19, #{(int(arg[3:]) - 8) * 8}]")
ins.append(f"mov {rtor[out.nm]}, x15")
elif uop == UOps.CAST:
if arg == BinaryOps.CMPEQ: