refactor with ci

2026-06-24 02:14:17 +00:00 · 2023-07-31 22:34:29 -04:00 · 2023-07-31 22:34:29 -04:00 · 8e68014299
commit 8e68014299
parent b44463aa66
3 changed files with 141 additions and 121 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -12,33 +12,48 @@ jobs:
    timeout-minutes: 20

    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-    - name: Cache pip
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/pip
-        key: linting
-    - name: Install dependencies
-      run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Repo line count
-      run: python3 sz.py
-    - name: Lint with pylint
-      run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string='  ' **/*.py
-    - name: Lint with flake8
-      run: flake8 --statistics -j4
-    - name: Lint tinygrad with pylint
-      run: pylint tinygrad/
-    - name: Run mypy
-      run: mypy tinygrad/ --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
-    - name: Install SLOCCount
-      run: sudo apt install sloccount
-    - name: Check <5000 lines
-      run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
+      - name: Checkout Code
+        uses: actions/checkout@v3
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+      - name: Cache pip
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: linting
+      - name: Install dependencies
+        run: pip install -e '.[linting,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+      - name: Repo line count
+        run: python3 sz.py
+      - name: Lint with pylint
+        run: python -m pylint --disable=all -e W0311 -e C0303 --jobs=0 --indent-string='  ' **/*.py
+      - name: Lint with flake8
+        run: flake8 --statistics -j4
+      - name: Lint tinygrad with pylint
+        run: pylint tinygrad/
+      - name: Run mypy
+        run: mypy tinygrad/ --ignore-missing-imports --check-untyped-defs --explicit-package-bases --warn-unreachable
+      - name: Install SLOCCount
+        run: sudo apt install sloccount
+      - name: Check <5000 lines
+        run: sloccount tinygrad test examples extra; if [ $(sloccount tinygrad | sed -n 's/.*Total Physical Source Lines of Code (SLOC)[ ]*= \([^ ]*\).*/\1/p' | tr -d ',') -gt 5000 ]; then exit 1; fi
+
+  testarm64:
+    name: ARM64 Test
+    runs-on: ubuntu-latest  # NOTE(review): the other jobs in this workflow set timeout-minutes: 20; consider adding it here too
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v3
+      - name: Setup qemu
+        uses: docker/setup-qemu-action@v3  # v1 targets the retired Node 12 action runtime
+        with:
+          platforms: all
+      - name: Install dependencies
+        run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+      - name: Test Ops
+        run: ARM64=1 CLANG=1 python -m pytest -n=auto test/test_ops.py

  testcpuimagenet:
    name: CPU and ImageNet to C Tests
@ -46,35 +61,35 @@ jobs:
    timeout-minutes: 20

    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-    - name: Cache pip
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/pip
-        key: testing
-    - name: Install Dependencies
-      run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Test Docs
-      run: python docs/abstractions.py
-    - name: Test Quickstart
-      run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python3 quickstart.py
-    - name: Run Pytest
-      run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)"
-    - name: Fuzz Test symbolic
-      run: python test/external/fuzz_symbolic.py
-    - name: Fuzz Test shapetracker
-      run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
-    - name: Compile EfficientNet to C
-      run: PYTHONPATH="." CLANG=1 python3 examples/compile_efficientnet.py > recognize.c
-    - name: Compile C to native
-      run: clang -O2 recognize.c -lm -o recognize
-    - name: Test EfficientNet
-      run: curl https://media.istockphoto.com/photos/hen-picture-id831791190 | ./recognize | grep hen
+      - name: Checkout Code
+        uses: actions/checkout@v3
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+      - name: Cache pip
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: testing
+      - name: Install Dependencies
+        run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+      - name: Test Docs
+        run: python docs/abstractions.py
+      - name: Test Quickstart
+        run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && PYTHONPATH=. python3 quickstart.py
+      - name: Run Pytest
+        run: python -m pytest -n=auto test/ -k "not (test_efficientnet and models/test_train.py)"
+      - name: Fuzz Test symbolic
+        run: python test/external/fuzz_symbolic.py
+      - name: Fuzz Test shapetracker
+        run: PYTHONPATH="." python test/external/fuzz_shapetracker.py
+      - name: Compile EfficientNet to C
+        run: PYTHONPATH="." CLANG=1 python3 examples/compile_efficientnet.py > recognize.c
+      - name: Compile C to native
+        run: clang -O2 recognize.c -lm -o recognize
+      - name: Test EfficientNet
+        run: curl https://media.istockphoto.com/photos/hen-picture-id831791190 | ./recognize | grep hen

  testtorch:
    name: Torch Tests
@ -82,23 +97,23 @@ jobs:
    timeout-minutes: 20

    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.8
-    - name: Cache pip
-      uses: actions/cache@v3
-      with:
-        path: ~/.cache/pip
-        key: testing
-    - name: Install Dependencies
-      run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Run Pytest
-      run: TORCH=1 python -m pytest -n=auto test/
-    - name: Run ONNX
-      run: TORCH=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --tb=no --disable-warnings || true
+      - name: Checkout Code
+        uses: actions/checkout@v3
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.8
+      - name: Cache pip
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: testing
+      - name: Install Dependencies
+        run: pip install -e '.[testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+      - name: Run Pytest
+        run: TORCH=1 python -m pytest -n=auto test/
+      - name: Run ONNX
+        run: TORCH=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --tb=no --disable-warnings || true

  testopencl:
    strategy:
@ -158,33 +173,33 @@ jobs:
    timeout-minutes: 20

    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Set up Python 3.11
-      uses: actions/setup-python@v4
-      with:
-        python-version: 3.11
-    - name: Cache pip
-      uses: actions/cache@v3
-      with:
-        path: ~/Library/Caches/pip
-        key: metalwebgpu
-    - name: Install Dependencies
-      run: pip install -e '.[metal,webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
-    - name: Test LLaMA compile speed
-      run: PYTHONPATH="." METAL=1 python3 test/external/external_test_speed_llama.py
-    #- name: Run dtype test
-    #  run: DEBUG=4 METAL=1 python -m pytest test/test_dtype.py
-    # dtype test has issues on test_half_to_int8
-    - name: Run metal ops test
-      run: DEBUG=2 METAL=1 python -m pytest test/test_ops.py
-    - name: Run JIT test
-      run: DEBUG=2 METAL=1 python -m pytest test/test_jit.py
-    # TODO: why not testing the whole test/?
-    - name: Run webgpu pytest
-      run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto -m 'webgpu'
-    - name: Build WEBGPU Efficientnet
-      run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.webgpu.compile_webgpu
+      - name: Checkout Code
+        uses: actions/checkout@v3
+      - name: Set up Python 3.11
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+      - name: Cache pip
+        uses: actions/cache@v3
+        with:
+          path: ~/Library/Caches/pip
+          key: metalwebgpu
+      - name: Install Dependencies
+        run: pip install -e '.[metal,webgpu,testing]' --extra-index-url https://download.pytorch.org/whl/cpu
+      - name: Test LLaMA compile speed
+        run: PYTHONPATH="." METAL=1 python3 test/external/external_test_speed_llama.py
+      #- name: Run dtype test
+      #  run: DEBUG=4 METAL=1 python -m pytest test/test_dtype.py
+      # dtype test has issues on test_half_to_int8
+      - name: Run metal ops test
+        run: DEBUG=2 METAL=1 python -m pytest test/test_ops.py
+      - name: Run JIT test
+        run: DEBUG=2 METAL=1 python -m pytest test/test_jit.py
+      # TODO: why not testing the whole test/?
+      - name: Run webgpu pytest
+        run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m pytest -n=auto -m 'webgpu'
+      - name: Build WEBGPU Efficientnet
+        run: WEBGPU=1 WGPU_BACKEND_TYPE=Metal python -m examples.webgpu.compile_webgpu

  testdocker:
    name: Docker Test
@ -192,12 +207,12 @@ jobs:
    if: ${{ false }}

    steps:
-    - name: Checkout Code
-      uses: actions/checkout@v3
-    - name: Build Docker
-      run: docker build -t tinygrad -f test/Dockerfile .
-    - name: Test Docker
-      run: docker run --rm tinygrad /usr/bin/env python3 -c "from tinygrad.tensor import Tensor; print(Tensor.eye(3).numpy())"
+      - name: Checkout Code
+        uses: actions/checkout@v3
+      - name: Build Docker
+        run: docker build -t tinygrad -f test/Dockerfile .
+      - name: Test Docker
+        run: docker run --rm tinygrad /usr/bin/env python3 -c "from tinygrad.tensor import Tensor; print(Tensor.eye(3).numpy())"

  tests:
    strategy:
@ -218,7 +233,7 @@ jobs:
      - name: Cache pip
        uses: actions/cache@v3
        with:
-          path: '~/.cache/pip'
+          path: "~/.cache/pip"
          key: ${{ matrix.backend }}
      - name: Set env
-        run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLED_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n'}}" >> $GITHUB_ENV
+        run: printf "${{ matrix.backend == 'llvm' && 'ENABLE_METHOD_CACHE=1\nLLVM=1' || matrix.backend == 'clang' && 'CLANG=1\nENABLE_METHOD_CACHE=1' || matrix.backend == 'gpu' && 'GPU=1' || matrix.backend == 'cuda' && 'FORWARD_ONLY=1\nJIT=1\nOPT=2\nCUDA=1\nCUDACPU=1\n'}}" >> $GITHUB_ENV
@ -270,4 +285,4 @@ jobs:
        run: python -m pytest -n=auto test/ -k '${{matrix.backend=='llvm'&&'not (test_nn.py and test_conv_transpose2d)'||'test'}}' -m 'not exclude_${{matrix.backend}}'
      - name: Run pytest (cuda)
        if: matrix.backend=='cuda'
-        run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
+        run: python -m pytest -n=auto test/ -k 'not (half or test_efficientnet_safetensors) and not (test_conv2d and test_tensor.py)' -m 'not exclude_cuda' --ignore=test/external --ignore=test/models
--- a/extra/assembly/assembly.py
+++ b/extra/assembly/assembly.py
@ -38,12 +38,12 @@ class AssemblyCodegen(Linearizer):
  # s registers are the addresses and non local indexes
  def codegen(self):
    self.process()
-    if not getenv("CLANG"): self.hand_coded_optimizations()
-    self.limit_global_dims(3)  # all GPU asms have 3 (for now)
+    self.hand_coded_optimizations()
+    if not getenv("CLANG"): self.limit_global_dims(3)  # all GPU asms have 3 (for now)
    self.linearize()

    cnts:DefaultDict[Tuple[DType, bool], int] = defaultdict(int)
    tor: Dict[Any, Register] = {}
    def newreg(tok, dtype=dtypes.float32, scalar=False):
      nonlocal cnts, tor
      if isinstance(tok, Token): dtype = tok.dtype  # this
@ -82,7 +86,7 @@ class AssemblyCodegen(Linearizer):
      AndNode: lambda self,ops,ctx: functools.reduce(lambda a,b: render_alu(BinaryOps.MUL, a, b.render(ops,ctx), dtype=dtypes.bool), self.nodes[1:], self.nodes[0].render(ops,ctx)) }

    def addr_w_offset(args):
-      idx = args.idx*args.memory_dtype.itemsize
+      idx = args.idx*buf_to_dtype[args.name].itemsize
      off = 0  # TODO: should this be None?
      if isinstance(idx, SumNode):
        nums = [n.b for n in idx.nodes if isinstance(n, NumNode)]
@ -95,15 +99,15 @@ class AssemblyCodegen(Linearizer):
          new_reg = newreg((reg.nm, 'vec'), dtype=reg.dtype)
          ins.append(AssemblyInstruction(UOps.ALU, new_reg, [reg], UnaryOps.NOOP))
          reg = new_reg
-        return tor[f"buf{args.i}"], reg, off
-      reg = render_alu(BinaryOps.ADD, render_cast(reg, dtypes.uint64), tor[args[0]], dtype=dtypes.uint64)
+        return tor[f"buf{buf_index[args.name]}"], reg, off
+      reg = render_alu(BinaryOps.ADD, render_cast(reg, dtypes.uint64), tor[f"buf{buf_index[args.name]}"], dtype=dtypes.uint64)
      return reg, None, off

    buf_to_dtype = {args[0]:args[1] for uop,_,_,args in self.uops if uop == UOps.DEFINE_GLOBAL}
    buf_index = {x:i for i,x in enumerate(buf_to_dtype.keys())}

    ins = []
-    ins += [AssemblyInstruction(UOps.DEFINE_GLOBAL, newreg(args[0], dtype=dtypes.uint64, scalar=True), [], args[0]) for uop,_,_,args in self.uops if uop == UOps.DEFINE_GLOBAL]
+    ins += [AssemblyInstruction(UOps.SPECIAL, newreg(f"buf{i}", dtype=dtypes.uint64, scalar=True), [], f"buf{i}") for i in range(len(self.bufs))]
    global_size, local_size = [], []
    skipload_branch = 0
    for uop,newvar,vin,args in self.uops:
@ -166,13 +170,13 @@ class AssemblyCodegen(Linearizer):
              ins.append(AssemblyInstruction(UOps.COND_BRANCH, None, [pred], (f"$skipload_{skipload_branch}", False)))
          if args.valid.max == 1:
              # NOTE: you can't compute the index in here, because it assumes it's all available later
-              ins.append(AssemblyInstruction(UOps.LOAD, reg, [idx] + ([treg] if treg is not None else []), (off, 'global', args.memory_dtype if buf_to_dtype[args.name] != dtypes.float else None))) #if args.i != -1 else 'shared')
+              ins.append(AssemblyInstruction(UOps.LOAD, reg, [idx] + ([treg] if treg is not None else []), (off, 'global' if buf_index[args.name] != -1 else 'shared', args.memory_dtype if buf_to_dtype[args.name] != dtypes.float else None))) #if args.i != -1 else 'shared')
          if args.valid.min == 0 and args.valid.max == 1:
            ins.append(AssemblyInstruction(UOps.LABEL, None, [], f"$skipload_{skipload_branch}"))
            skipload_branch += 1
      elif uop == UOps.STORE:
        idx, treg, off = addr_w_offset(args)
-        ins.append(AssemblyInstruction(UOps.STORE, None, [idx, tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global', args.memory_dtype if buf_to_dtype['data0'] != dtypes.float else None))) #if args.i != -1 else 'shared')
+        ins.append(AssemblyInstruction(UOps.STORE, None, [idx, tor[vin[0]]] + ([treg] if treg is not None else []), (off, 'global' if buf_index[args.name] != -1 else 'shared', args.memory_dtype if buf_to_dtype['data0'] != dtypes.float else None))) #if args.i != -1 else 'shared')

    # define registers
    ins = [AssemblyInstruction(UOps.DEFINE_REGISTER, None, [], (dtype, type_to_letter(dtype), c)) for dtype,c in cnts.items()] + ins
--- a/extra/assembly/assembly_arm64.py
+++ b/extra/assembly/assembly_arm64.py
@ -14,6 +14,7 @@ def compute_offsets(total):
 def get_op(op): return f"bl {'_' if system() == 'Darwin' else ''}{op}"

 class ARM64Codegen(AssemblyCodegen):
+  def hand_coded_optimizations(self): pass
  def specialize(self, asm):
    var_size = 0
    prev_uop = None
@ -28,7 +29,7 @@ class ARM64Codegen(AssemblyCodegen):

    def mov_imm(value, to):
        # Manually move value into reg if vin[1] can't fit
-        if value > 65535:
+        if abs(value) > abs(65535) and abs(value) != float('inf'):
          ins.append(f"movz w15, #{value & 0xffff}")
          ins.append(f"movk w15, #{(value >> 16) & 0xffff}, lsl #16")
          ins.append(f"sxtw {to}, w15")
@ -75,11 +76,11 @@ class ARM64Codegen(AssemblyCodegen):
        ins.append(f"mov x15, {mem_vars[v.nm]}")
        ins.append(f"ldr {rtor[v.nm]}, [sp, x15]")

-      if uop == UOps.DEFINE_GLOBAL:
-        if arg.startswith('data'):
+      if uop == UOps.SPECIAL:
+        if arg.startswith('buf'):
          # data 8 to n into the stack 
-          if int(arg[4:]) >= 8:
-            ins.append(f"ldr x15, [x19, #{(int(arg[4:]) - 8) * 8}]")
+          if int(arg[3:]) >= 8:
+            ins.append(f"ldr x15, [x19, #{(int(arg[3:]) - 8) * 8}]")
            ins.append(f"mov {rtor[out.nm]}, x15")
      elif uop == UOps.CAST:
        if arg == BinaryOps.CMPEQ: