cleanup tests, bump caches

2026-06-24 02:14:17 +00:00 · 2025-08-19 21:08:57 -07:00
10 changed files with 33 additions and 37 deletions
--- a/.github/actions/setup-tinygrad/action.yml
+++ b/.github/actions/setup-tinygrad/action.yml
@ -121,7 +121,7 @@ runs:
        echo 'Acquire::GzipIndexes "true";' | sudo tee /etc/apt/apt.conf.d/gzip
        echo 'Acquire::http::Pipeline-Depth "5";' | sudo tee -a /etc/apt/apt.conf.d/99parallel
        echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' | sudo tee -a /etc/apt/apt.conf.d/99keep-debs
-    
+
    - name: Add OpenCL Repo
      if: inputs.opencl == 'true' && runner.os == 'Linux'
      shell: bash
@ -174,7 +174,7 @@ runs:
        if [[ "${{ inputs.llvm }}" == "true" ]]; then
          pkgs+=" libllvm20 clang-20 lld-20"
        fi
-        
+
        echo "pkgs=$pkgs" >> "$GITHUB_OUTPUT"
        echo "hash=$(echo -n "$pkgs" | sha256sum | cut -d' ' -f1)" >> "$GITHUB_OUTPUT"
@ -183,21 +183,21 @@ runs:
      uses: actions/cache@v4
      with:
        path: /var/cache/apt/archives/
-        key: ${{ runner.os }}-apt-${{ steps.apt-pkgs.outputs.hash }}
+        key: ${{ runner.os }}-apt-${{ steps.apt-pkgs.outputs.hash }}-${{ env.APT_CACHE_VERSION }}
    - name: Run apt Update + Install
      if: runner.os == 'Linux' && (inputs.opencl == 'true' || inputs.amd == 'true' || inputs.cuda == 'true' || inputs.webgpu == 'true' || inputs.llvm == 'true')
      shell: bash
      run: |
        sudo apt -qq update || true
-        
+
        # ******** do install ********
        if [[ -n "${{ steps.apt-pkgs.outputs.pkgs }}" ]]; then
          sudo apt-get -y --allow-unauthenticated --no-install-recommends install ${{ steps.apt-pkgs.outputs.pkgs }}
        fi
-        
+
        sudo chown -R $USER:$USER /var/cache/apt/archives/
-    
+
    # **** AMD ****
    - name: Setup AMD (Linux)
      if: inputs.amd == 'true' && runner.os == 'Linux'
@ -234,7 +234,7 @@ runs:
        cache-name: cache-gpuocelot-build
      with:
        path: ${{ github.workspace }}/gpuocelot/ocelot
-        key: ${{ runner.os }}-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-0
+        key: ${{ runner.os }}-gpuocelot-b16039dc940dc6bc4ea0a98380495769ff35ed99-rebuild-${{ env.BUILD_CACHE_VERSION }}
    - name: Clone/compile gpuocelot
      if: inputs.ocelot == 'true' && steps.cache-build.outputs.cache-hit != 'true'
      shell: bash
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -63,7 +63,7 @@ jobs:
    - name: Run model inference benchmark
      run: METAL=1 python3.11 test/external/external_model_benchmark.py
    - name: Test speed vs torch
-      run: BIG=2 MPS=1 python3.11 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
+      run: BIG=2 MPS=1 python3.11 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt
    - name: Test tensor cores
      run: METAL=1 python3.11 test/test_linearizer.py TestLinearizer.test_tensor_cores TestLinearizer.test_tensor_cores_padded TestLinearizer.test_tensor_cores_padded_uops
    - name: Test AMX tensor cores
@ -187,7 +187,7 @@ jobs:
    - name: Run model inference benchmark
      run: NV=1 CAPTURE_PROCESS_REPLAY=0 NOCLANG=1 python3 test/external/external_model_benchmark.py
    - name: Test speed vs torch
-      run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
+      run: NV=1 CAPTURE_PROCESS_REPLAY=0 HALF=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt
    - name: Test speed vs theoretical
      run: NV=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
    - name: Test benchmark allreduce
@ -389,7 +389,7 @@ jobs:
    #- name: Test speed vs torch
    #  run: |
    #    python3 -c "import torch; print(torch.__version__)"
-    #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/external/external_test_speed_v_torch.py | tee torch_speed.txt
+    #    LD_PRELOAD="/opt/rocm/lib/libhsa-runtime64.so" HSA=1 BIG=2 TORCHCUDA=1 python3 test/speed/external_test_speed_v_torch.py | tee torch_speed.txt
    - name: Test speed vs theoretical
      run: AMD=1 IGNORE_BEAM_CACHE=1 BEAM_DEBUG=1 DEBUG=1 python -m pytest -rA test/external/speed_v_theoretical.py --durations=20
    - name: Test tensor cores
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -1,8 +1,10 @@
 name: Unit Tests
 env:
  # increment this when downloads substantially change to avoid the internet
-  DOWNLOAD_CACHE_VERSION: '11'
+  DOWNLOAD_CACHE_VERSION: '12'
-  PYTHON_CACHE_VERSION: '2'
+  PYTHON_CACHE_VERSION: '3'
  APT_CACHE_VERSION: '1'
  BUILD_CACHE_VERSION: '1'
  CAPTURE_PROCESS_REPLAY: 1
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
@ -30,9 +32,9 @@ jobs:
    - name: External Benchmark Schedule
      run: PYTHONPATH="." python3 test/external/external_benchmark_schedule.py
    - name: Speed Test
-      run: LLVM=1 python3 test/external/external_test_speed_v_torch.py
+      run: LLVM=1 python3 test/speed/external_test_speed_v_torch.py
    - name: Speed Test (BEAM=2)
-      run: BEAM=2 LLVM=1 python3 test/external/external_test_speed_v_torch.py
+      run: BEAM=2 LLVM=1 python3 test/speed/external_test_speed_v_torch.py
  docs:
    name: Docs
--- a/test/device/test_ocl.py
+++ b/test/device/test_ocl.py
--- a/test/external/external_benchmark_kernel_launch.py
+++ b/test/external/external_benchmark_kernel_launch.py
@ -2,7 +2,7 @@ import time
 from tinygrad import Tensor, TinyJit, Device, Context
 from tinygrad.helpers import Profiling, Timing, GlobalCounters
-# python3 test/external/external_test_speed_v_torch.py TestSpeed.test_add_a
+# python3 test/speed/external_test_speed_v_torch.py TestSpeed.test_add_a
@TinyJit
 def plus(a:Tensor, b:Tensor): return a+b
--- a/test/speed/external_test_copy_speed.py
+++ b/test/speed/external_test_copy_speed.py
--- a/test/speed/external_test_device_speed.py
+++ b/test/speed/external_test_device_speed.py
--- a/test/external/external_test_speed_v_torch.py
+++ b/test/external/external_test_speed_v_torch.py
--- a/test/test_compile_failures.py
+++ b/test/test_compile_failures.py
@ -1,7 +1,10 @@
-import unittest
+import unittest, io
 from contextlib import redirect_stdout
 from tinygrad import Tensor, dtypes, Device
 from tinygrad.helpers import OSX
 from tinygrad.engine.realize import lower_schedule
 from tinygrad.device import is_dtype_supported
 from tinygrad.engine.realize import get_program
 class TestCompileFailures(unittest.TestCase):
  def compile(self, out:Tensor):
@ -14,5 +17,17 @@ class TestCompileFailures(unittest.TestCase):
  def test_add_max_uchar(self):
    self.compile((Tensor.empty(1024, dtype='uint8') + Tensor.empty(1024, dtype='uint8')).max())
 class TestDisassembly(unittest.TestCase):
  # Checks the disassembly text produced for a float16 add on the default device.
  # TODO: fails on llvm. llvm.LLVMGetHostCPUName() returns "generic"
  # Only runs when the default device is "CPU" and the OSX helper flag is truthy.
  @unittest.skipUnless(Device.DEFAULT in ("CPU",) and OSX, "m series cpus support fp16 arithmetic")
  def test_float16_alu(self):
    # Add two single-element float16 tensors.
    c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
    # Take the last item of the schedule built for the result.
    s = c.schedule()[-1]
    # Build the program from that item's AST with the default device's renderer,
    # then compile its source with the default device's compiler.
    p = get_program(s.ast, Device[Device.DEFAULT].renderer)
    lib = Device[Device.DEFAULT].compiler.compile(p.src)
    # Capture whatever disassemble() writes to stdout so it can be inspected.
    out = io.StringIO()
    with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib)
    # NOTE(review): presumably "fcvt" in the listing would mean the add was done
    # via a float conversion instead of native fp16 arithmetic -- confirm.
    assert "fcvt" not in out.getvalue()
 if __name__ == '__main__':
  unittest.main()
--- a/test/test_disassembly.py
+++ b/test/test_disassembly.py
@ -1,21 +0,0 @@
 import unittest, io
 from tinygrad import Tensor, dtypes
 from contextlib import redirect_stdout
 from tinygrad.device import Device
 from tinygrad.helpers import OSX
 from tinygrad.engine.realize import get_program
 class TestDisassembly(unittest.TestCase):
  # TODO: fails on llvm. llvm.LLVMGetHostCPUName() returns "generic"
  @unittest.skipUnless(Device.DEFAULT in ("CPU",) and OSX, "m series cpus support fp16 arithmetic")
  def test_float16_alu(self):
    c = Tensor([1], dtype=dtypes.float16) + Tensor([1], dtype=dtypes.float16)
    s = c.schedule()[-1]
    p = get_program(s.ast, Device[Device.DEFAULT].renderer)
    lib = Device[Device.DEFAULT].compiler.compile(p.src)
    out = io.StringIO()
    with redirect_stdout(out): Device[Device.DEFAULT].compiler.disassemble(lib)
    assert "fcvt" not in out.getvalue()
 if __name__ == "__main__":
  unittest.main()