Merge branch 'master' into nak

This commit is contained in:
Christopher Milan 2025-09-25 13:31:23 -07:00
commit 52c7fe80aa
151 changed files with 2173 additions and 1572 deletions

View file

@ -253,8 +253,13 @@ runs:
git checkout b16039dc940dc6bc4ea0a98380495769ff35ed99
mkdir build
cd build
cmake .. -Wno-dev -G Ninja -DOCELOT_BUILD_TOOLS=OFF -DCMAKE_BUILD_ALWAYS=0 -DBUILD_TESTS_CUDA=OFF \
-DBoost_INCLUDE_DIR=$(brew --prefix boost)/include -DBoost_LIBRARY_DIR=$(brew --prefix boost)/lib -DCMAKE_POLICY_VERSION_MINIMUM=3.5
CMAKE_ARGS="-Wno-dev -G Ninja -DOCELOT_BUILD_TOOLS=OFF -DCMAKE_BUILD_ALWAYS=0 -DBUILD_TESTS_CUDA=OFF -DCMAKE_POLICY_VERSION_MINIMUM=3.5"
if [[ "${{ runner.os }}" == "macOS" ]]; then
CMAKE_ARGS="$CMAKE_ARGS -DBoost_INCLUDE_DIR=$(brew --prefix boost)/include -DBoost_LIBRARY_DIR=$(brew --prefix boost)/lib"
fi
cmake .. $CMAKE_ARGS
ninja
- name: Install gpuocelot
if: inputs.ocelot == 'true'

View file

@ -28,7 +28,7 @@ jobs:
# since sudo is required for usbgpu on macos, move the cache to a new location, as some of the files are owned by root
PYTHONPYCACHEPREFIX: /tmp/tiny_python_pycache
runs-on: [self-hosted, macOS]
timeout-minutes: 20
timeout-minutes: 60
defaults:
run:
shell: bash -e -o pipefail {0}
@ -160,7 +160,7 @@ jobs:
testnvidiabenchmark:
name: tinybox green Benchmark
runs-on: [self-hosted, Linux, tinyboxgreen]
timeout-minutes: 30
timeout-minutes: 60
defaults:
run:
shell: bash -e -o pipefail {0}
@ -197,14 +197,14 @@ jobs:
- name: Test tensor cores
run: |
NV=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
PTX=1 ALLOW_TF32=1 NV=1 python3 test/opt/test_tensor_cores.py
NV=1 NV_PTX=1 ALLOW_TF32=1 python3 test/opt/test_tensor_cores.py
- name: Run Tensor Core GEMM (CUDA)
run: |
CUDA=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul.txt
CUDA=1 SHOULD_USE_TC=1 BFLOAT16=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_bfloat16.txt
CUDA=1 SHOULD_USE_TC=1 ALLOW_TF32=1 DEBUG=2 ATOL=2e-2 python3 extra/gemm/simple_matmul.py | tee matmul_tf32.txt
- name: Run Tensor Core GEMM (PTX)
run: NV=1 PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt
run: NV=1 NV_PTX=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_ptx.txt
- name: Run Tensor Core GEMM (NV)
run: NV=1 SHOULD_USE_TC=1 HALF=1 DEBUG=2 python3 extra/gemm/simple_matmul.py | tee matmul_nv.txt
- name: Test NV=1
@ -274,7 +274,7 @@ jobs:
testmorenvidiabenchmark:
name: tinybox green Training Benchmark
runs-on: [self-hosted, Linux, tinyboxgreen]
timeout-minutes: 20
timeout-minutes: 60
defaults:
run:
shell: bash -e -o pipefail {0}
@ -302,7 +302,7 @@ jobs:
- name: Fuzz Padded Tensor Core GEMM (NV)
run: NV=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
- name: Fuzz Padded Tensor Core GEMM (PTX)
run: NV=1 PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
run: NV=1 NV_PTX=1 M_START=12 M_STOP=20 M_STEP=1 N_START=6 N_STOP=10 N_STEP=1 K_START=28 K_STOP=36 K_STEP=1 HALF=1 TC_OPT=2 python3 ./extra/gemm/fuzz_matmul.py
- name: Train MNIST
run: time PYTHONPATH=. NV=1 TARGET_EVAL_ACC_PCT=96.0 python3 examples/beautiful_mnist.py | tee beautiful_mnist.txt
- name: Run 10 CIFAR training steps
@ -346,7 +346,7 @@ jobs:
testamdbenchmark:
name: tinybox red Benchmark
runs-on: [self-hosted, Linux, tinybox]
timeout-minutes: 20
timeout-minutes: 60
defaults:
run:
shell: bash -e -o pipefail {0}
@ -476,7 +476,7 @@ jobs:
testmoreamdbenchmark:
name: tinybox red Training Benchmark
runs-on: [self-hosted, Linux, tinybox]
timeout-minutes: 30
timeout-minutes: 60
defaults:
run:
shell: bash -e -o pipefail {0}
@ -511,8 +511,8 @@ jobs:
run: BENCHMARK_LOG=cifar_10steps ASSERT_MIN_STEP_TIME=85 AMD=1 STEPS=10 python3 examples/hlb_cifar10.py | tee train_cifar.txt
- name: Run 10 CIFAR training steps w HALF
run: BENCHMARK_LOG=cifar_10steps_half ASSERT_MIN_STEP_TIME=188 AMD=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_half.txt
- name: Run 10 CIFAR training steps w BF16
run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
# - name: Run 10 CIFAR training steps w BF16
# run: BENCHMARK_LOG=cifar_10steps_bf16 ASSERT_MIN_STEP_TIME=288 AMD=1 STEPS=10 DEFAULT_FLOAT=BFLOAT16 python3 examples/hlb_cifar10.py | tee train_cifar_bf16.txt
- name: Run 10 CIFAR training steps w winograd
run: BENCHMARK_LOG=cifar_10steps_half_wino ASSERT_MIN_STEP_TIME=66 AMD=1 WINO=1 STEPS=10 DEFAULT_FLOAT=HALF python3 examples/hlb_cifar10.py | tee train_cifar_wino.txt
- name: Run full CIFAR training w 1 GPU
@ -539,7 +539,7 @@ jobs:
testmlperfamdbenchmark:
name: tinybox red MLPerf Benchmark
runs-on: [self-hosted, Linux, tinybox]
timeout-minutes: 30
timeout-minutes: 60
defaults:
run:
shell: bash -e -o pipefail {0}
@ -645,7 +645,7 @@ jobs:
testreddriverbenchmark:
name: AM Benchmark
runs-on: [self-hosted, Linux, tinyboxrandom]
timeout-minutes: 15
timeout-minutes: 20
defaults:
run:
shell: bash -e -o pipefail {0}
@ -716,7 +716,7 @@ jobs:
testgreendriverbenchmark:
name: NV Benchmark
runs-on: [self-hosted, Linux, tinyboxrandom]
timeout-minutes: 15
timeout-minutes: 20
defaults:
run:
shell: bash -e -o pipefail {0}

View file

@ -30,8 +30,6 @@ jobs:
key: llvm-speed
deps: testing_minimal
llvm: 'true'
- name: External Benchmark Schedule
run: python3 test/external/external_benchmark_schedule.py
- name: Speed Test
run: CPU=1 CPU_LLVM=1 python3 test/speed/external_test_speed_v_torch.py
- name: Speed Test (BEAM=2)
@ -48,7 +46,7 @@ jobs:
uses: ./.github/actions/setup-tinygrad
with:
deps: docs
pydeps: "capstone"
pydeps: "capstone torch"
- name: Build wheel and show size
run: |
pip install build
@ -79,6 +77,8 @@ jobs:
run: |
python docs/abstractions2.py
python docs/abstractions3.py
- name: Test README
run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py && python README.py
- name: Test Quickstart
run: awk '/```python/{flag=1;next}/```/{flag=0}flag' docs/quickstart.md > quickstart.py && python quickstart.py
- name: Test DEBUG
@ -93,8 +93,6 @@ jobs:
name: Torch Backend Tests
runs-on: ubuntu-latest
timeout-minutes: 15
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -132,8 +130,6 @@ jobs:
name: Torch Backend Tests More
runs-on: ubuntu-latest
timeout-minutes: 15
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -152,20 +148,28 @@ jobs:
- name: Test some torch tests (expect failure)
run: python3 -m pytest extra/torch_backend/torch_tests.py -v --tb=no || true
tc:
name: Tensor Core tests
bepython:
name: Python Backend
runs-on: ubuntu-latest
timeout-minutes: 10
env:
IGNORE_OOB: 0
timeout-minutes: 15
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: uops-minimal
key: be-minimal
deps: testing_minimal
- name: Test dtype with Python emulator
run: DEBUG=1 PYTHON=1 python3 -m pytest -n=auto test/test_dtype.py test/test_dtype_alu.py
- name: Test ops with Python emulator
run: DEBUG=2 SKIP_SLOW_TEST=1 PYTHON=1 python3 -m pytest -n=auto test/test_ops.py --durations=20
- name: Test uops with Python emulator
run: PYTHON=1 python3 -m pytest test/test_uops.py --durations=20
- name: Test symbolic with Python emulator
run: PYTHON=1 python3 test/test_symbolic_ops.py
- name: test_renderer_failures with Python emulator
run: PYTHON=1 python3 -m pytest -rA test/test_renderer_failures.py::TestRendererFailures
- name: Test IMAGE=2 support
run: |
IMAGE=2 PYTHON=1 python3 test/test_ops.py TestOps.test_gemm
@ -212,31 +216,6 @@ jobs:
DEBUG=2 EMULATE=INTEL PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStatsMatmulHalf
DEBUG=2 AMX=1 EMULATE=AMX PYTHON=1 python3 ./test/test_uops_stats.py TestUOpsStats.test_simple_matmul
bepython:
name: Python Backend
runs-on: ubuntu-latest
timeout-minutes: 10
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: be-minimal
deps: testing_minimal
- name: Test dtype with Python emulator
run: DEBUG=1 PYTHON=1 python3 -m pytest -n=auto test/test_dtype.py test/test_dtype_alu.py
- name: Test ops with Python emulator
run: DEBUG=2 SKIP_SLOW_TEST=1 PYTHON=1 python3 -m pytest -n=auto test/test_ops.py --durations=20
- name: Test uops with Python emulator
run: PYTHON=1 python3 -m pytest test/test_uops.py --durations=20
- name: Test symbolic with Python emulator
run: PYTHON=1 python3 test/test_symbolic_ops.py
- name: test_renderer_failures with Python emulator
run: PYTHON=1 python3 -m pytest -rA test/test_renderer_failures.py::TestRendererFailures
linter:
name: Linters
runs-on: ubuntu-latest
@ -282,8 +261,6 @@ jobs:
key: unittest-12
pydeps: "pillow"
deps: testing_unit
- name: Test README
run: awk '/```python/{flag=1;next}/```/{flag=0}flag' README.md > README.py && python README.py
- name: Run unit tests
run: python -m pytest -n=auto test/unit/ --durations=20
- name: Run targetted tests on NULL backend
@ -295,6 +272,8 @@ jobs:
# run: NULL=1 python3 examples/llama.py --gen 1 --size 7B --shard 4 --prompt "Hello." --count 3 --temperature 0 --timing
- name: Run GC tests
run: python test/external/external_uop_gc.py
- name: External Benchmark Schedule
run: python3 test/external/external_benchmark_schedule.py
- name: Run process replay tests
uses: ./.github/actions/process-replay
- name: Regen dataset on test_tiny
@ -330,12 +309,10 @@ jobs:
- name: Fuzz Test shape ops
run: python test/external/fuzz_shape_ops.py
testgpuimage:
name: 'GPU IMAGE Tests'
testopenclimage:
name: CL IMAGE Tests
runs-on: ubuntu-22.04
timeout-minutes: 10
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -345,19 +322,17 @@ jobs:
key: gpu-image
deps: testing_minimal
opencl: 'true'
- name: Test GPU IMAGE=2 ops + training
- name: Test CL IMAGE=2 ops + training
run: |
GPU=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
GPU=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
CL=1 IMAGE=2 python -m pytest -n=auto test/test_ops.py --durations=20
CL=1 IMAGE=2 python test/models/test_end2end.py TestEnd2End.test_linear_mnist
- name: Run process replay tests
uses: ./.github/actions/process-replay
testgpumisc:
name: 'GPU Misc tests'
name: CL Misc tests
runs-on: ubuntu-22.04
timeout-minutes: 10
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -368,11 +343,11 @@ jobs:
deps: testing_minimal
opencl: 'true'
- name: Generate Dataset
run: GPU=1 extra/optimization/generate_dataset.sh
run: CL=1 extra/optimization/generate_dataset.sh
- name: Run Kernel Count Test
run: GPU=1 python -m pytest -n=auto test/external/external_test_opt.py
run: CL=1 python -m pytest -n=auto test/external/external_test_opt.py
- name: Run fused optimizer tests
run: GPU=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
run: CL=1 FUSE_OPTIM=1 python -m pytest -n=auto test/models/test_mnist.py
- name: Upload artifact
uses: actions/upload-artifact@v4
with:
@ -380,11 +355,9 @@ jobs:
path: /tmp/sops.gz
testopenpilot:
name: 'openpilot Compile Tests'
name: openpilot Compile Tests
runs-on: ubuntu-22.04
timeout-minutes: 15
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -397,26 +370,26 @@ jobs:
llvm: 'true'
- name: Test openpilot model kernel count and gate usage
run: |
ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2175 ALLOWED_GATED_READ_IMAGE=16 FLOAT16=0 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
ALLOWED_KERNEL_COUNT=208 ALLOWED_READ_IMAGE=2175 ALLOWED_GATED_READ_IMAGE=16 FLOAT16=0 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/v0.9.4/selfdrive/modeld/models/supercombo.onnx
- name: Test openpilot alt model correctness (float32)
run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/3799fe46b3a629e491d4b8498b8ae83e4c88c304/selfdrive/modeld/models/supercombo.onnx
- name: Test openpilot fastvits model correctness (float32)
run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
# - name: Test openpilot simple_plan vision model correctness (float32)
# run: FLOAT16=0 DEBUGCL=1 GPU=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
# run: FLOAT16=0 DEBUGCL=1 CL=1 IMAGE=2 python examples/openpilot/compile3.py https://gitlab.com/commaai/openpilot-lfs.git/gitlab-lfs/objects/35ff4f4577002f2685e50c8346addae33fe8da27a41dd4d6a0f14d1f4b1af81b
- name: Test openpilot LLVM compile
run: CPU=1 CPU_LLVM=1 LLVMOPT=1 JIT=2 BEAM=0 IMAGE=0 python examples/openpilot/compile3.py https://github.com/commaai/openpilot/raw/9118973ed03c1ae1d40cf69a29507ec2cc78efd7/selfdrive/modeld/models/supercombo.onnx
- name: Test openpilot compile4
run: NOLOCALS=1 GPU=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
run: NOLOCALS=1 CL=1 IMAGE=2 FLOAT16=1 DEBUG=2 python3 examples/openpilot/compile4.py
- name: Run process replay tests
uses: ./.github/actions/process-replay
# ****** ONNX Tests ******
testonnxcpu:
name: 'ONNX (CPU) Tests'
name: ONNX (CPU) Tests
runs-on: ubuntu-22.04
timeout-minutes: 20
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
@ -442,12 +415,9 @@ jobs:
uses: ./.github/actions/process-replay
testopencl:
name: 'ONNX (GPU)+Optimization Tests'
name: ONNX (CL)+Optimization Tests
runs-on: ubuntu-22.04
timeout-minutes: 20
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -459,16 +429,16 @@ jobs:
pydeps: "tensorflow==2.15.1 tensorflow_addons"
python-version: '3.11'
opencl: 'true'
- name: Test ONNX (GPU)
run: GPU=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
- name: Test ONNX (CL)
run: CL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
#- name: Test Optimization Helpers
# run: DEBUG=1 python3 extra/optimization/test_helpers.py
#- name: Test Action Space
# run: DEBUG=1 GPU=1 python3 extra/optimization/get_action_space.py
# run: DEBUG=1 CL=1 python3 extra/optimization/get_action_space.py
- name: Test Beam Search
run: GPU=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
run: CL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
- name: Test MLPerf stuff
run: GPU=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
run: CL=1 python -m pytest -n=auto test/external/external_test_optim.py test/external/external_test_losses.py test/external/external_test_metrics.py test/external/external_test_datasets.py --durations=20
- name: Test llama 3 training
run: MAX_BUFFER_SIZE=0 DEV=NULL SAMPLES=300 BS=8 SEQLEN=512 GRADIENT_ACC_STEPS=8 FAKEDATA=1 DEFAULT_FLOAT=bfloat16 OPTIM_DTYPE=bfloat16 LLAMA3_SIZE=1B MODEL=llama3 python3 examples/mlperf/model_train.py
- name: Run process replay tests
@ -488,12 +458,12 @@ jobs:
- name: Test 1B LLM
run: echo "What's a male chicken called? Answer with only one word." | MAX_BUFFER_SIZE=0 python3 -m tinygrad.apps.llm | grep -i rooster
# ****** Models Tests ******
testmodels:
name: Models (llvm+cpu+gpu)
runs-on: ubuntu-22.04
timeout-minutes: 15
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -506,15 +476,35 @@ jobs:
llvm: 'true'
- name: Test models (llvm)
run: CPU=1 CPU_LLVM=1 python -m pytest -n=auto test/models --durations=20
- name: Test models (gpu)
run: GPU=1 python -m pytest -n=auto test/models --durations=20
- name: Test models (opencl)
run: CL=1 python -m pytest -n=auto test/models --durations=20
- name: Test models (cpu)
run: CPU=1 CPU_LLVM=0 python -m pytest -n=auto test/models --durations=20
- name: Run process replay tests
uses: ./.github/actions/process-replay
testrangeify:
name: Linux (rangeify)
testmetalmodels:
name: Models (metal)
runs-on: macos-14
timeout-minutes: 20
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: metal
deps: testing
python-version: '3.11'
- name: Test models (Metal)
run: METAL=1 python -m pytest -n=auto test/models --durations=20
- name: Test LLaMA compile speed
run: METAL=1 python test/external/external_test_speed_llama.py
# ****** Feature Tests ******
testrangeifycpu:
name: Linux (rangeify) CPU
runs-on: ubuntu-24.04
timeout-minutes: 15
steps:
@ -525,28 +515,80 @@ jobs:
with:
key: rangeify-minimal-llvm
deps: testing_minimal
opencl: 'true'
llvm: "true"
- name: Test CPU=1 RANGEIFY=1
# TODO: add more passing tests here
# test_symbolic_arange_sym_step is passing now
# test_threefry_doesnt_use_long is because there's a contig after the long now
# test_load_state_dict_sharded_model_dict_same_axis issue with multi
# test_instancenorm_3d is very slow
run: |
CPU=1 CPU_LLVM=0 RANGEIFY=1 python3 -m pytest -n auto --durations 20 \
-k "not test_symbolic_arange_sym_step and not test_threefry_doesnt_use_long" \
test/test_tiny.py test/test_rangeify.py test/test_ops.py test/test_tensor_variable.py \
test/test_outerworld_range.py test/test_sample.py test/test_randomness.py
-k "not test_load_state_dict_sharded_model_dict_same_axis and not test_instancenorm_3d" \
test/test_tiny.py test/test_rangeify.py test/test_ops.py test/test_symbolic_ops.py test/test_symbolic_jit.py test/test_tensor_variable.py \
test/test_outerworld_range.py test/test_randomness.py test/test_nn.py test/test_arange.py test/test_tensor.py test/test_optim.py \
test/test_setitem.py
- name: Test const folding
run: CPU=1 RANGEIFY=1 python3 -m pytest -n auto --durations 20 test/test_const_folding.py -k "not test_cast_padded and not TestReduceOpsConstFolding and not TestMultiConstFolding"
- name: Test multitensor
run: |
CPU=1 RANGEIFY=1 python3 test/test_multitensor.py TestMultiTensor.test_matmul_shard_1_1 TestMultiTensor.test_simple_add_W TestMultiTensor.test_simple_reduce \
TestMultiTensor.test_elementwise_dtype TestMultiTensor.test_shard_no_recompile TestHandleData.test_copied_to_device TestMultiRamUsage
CPU=1 RANGEIFY=1 python3 -m pytest test/test_multitensor.py::TestMultiAssign -k 'not (multi_assign_piece_noncontig or multi_assign_var_offset)'
CPU=1 RANGEIFY=1 python3 -m pytest -n=auto test/test_multitensor.py::TestMultiTensor test/unit/test_allreduce.py -k 'not const_folding'
- name: Test CPU=1 RANGEIFY=2
run: CPU=1 CPU_LLVM=0 RANGEIFY=2 python3 -m pytest -n auto test/test_tiny.py test/test_rangeify.py test/test_ops.py --durations 20
# slow (and still wrong on beautiful_mnist)
#- name: Test LLVM=1 RANGEIFY=1 (slow tests)
#- name: Test LLVM RANGEIFY=1 (slow tests)
# run: CPU=1 CPU_LLVM=1 RANGEIFY=1 python3 -m pytest -n auto test/models/test_mnist.py --durations 20
- name: Run process replay tests
uses: ./.github/actions/process-replay
testrangeifycl:
name: Linux (rangeify) CL
runs-on: ubuntu-24.04
timeout-minutes: 15
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: rangeify-cl
deps: testing
opencl: 'true'
llvm: "true"
- name: Test CL=1 RANGEIFY=1
run: CL=1 RANGEIFY=1 pytest -n auto test/test_ops.py test/test_schedule.py test/test_symbolic_ops.py test/test_jit.py test/unit/test_disk_tensor.py test/models/test_mnist.py test/unit/test_mnist_dataset.py test/test_optim.py --durations 20
- name: Test Fuse
run: CL=1 RANGEIFY=2 python3 -m pytest --durations 20 test/test_softmax_fusion.py -k "not test_auto_softmax"
- name: Test ONNX
run: CL=1 RANGEIFY=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
- name: Run process replay tests
uses: ./.github/actions/process-replay
testrangeifymacos:
name: MacOS (rangeify)
runs-on: macos-14
timeout-minutes: 15
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: metal
deps: testing
- name: some unit tests
run: METAL=1 RANGEIFY=1 python -m pytest -n=auto test/unit/test_winograd.py --durations=20
- name: Test METAL=1 RANGEIFY=1
run: METAL=1 RANGEIFY=1 python -m pytest -n=auto test/test_ops.py --durations=20
- name: Run process replay tests
uses: ./.github/actions/process-replay
testdevectorize:
name: Linux (devectorize)
runs-on: ubuntu-24.04
timeout-minutes: 15
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -568,8 +610,6 @@ jobs:
name: Linux (DSP)
runs-on: ubuntu-24.04
timeout-minutes: 15
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -636,7 +676,6 @@ jobs:
runs-on: ubuntu-22.04
timeout-minutes: 20
env:
IGNORE_OOB: 0
AMD: 1
MOCKGPU: 1
FORWARD_ONLY: 1
@ -666,7 +705,7 @@ jobs:
run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
- name: Run TestOps.test_add with SQTT
run: |
PROFILE=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add
VIZ=1 SQTT=1 DEBUG=5 python3 test/test_ops.py TestOps.test_add
extra/sqtt/rgptool.py create "/tmp/profile.pkl.$USER" -o /tmp/gpu0.rgp
- name: Run process replay tests
uses: ./.github/actions/process-replay
@ -694,7 +733,7 @@ jobs:
cuda: 'true'
ocelot: 'true'
- name: Set env
run: printf "${{ matrix.backend == 'PTX' && 'CUDA=1\nPTX=1' || matrix.backend == 'nv' && 'NV=1\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
run: printf "${{ matrix.backend == 'PTX' && 'CUDA=1\nCUDA_PTX=1' || matrix.backend == 'nv' && 'NV=1\nSKIP_SLOW_TEST=1' }}" >> $GITHUB_ENV
- name: Check Device.DEFAULT and print some source
run: |
python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CUDA','NV'], Device.DEFAULT"
@ -705,18 +744,15 @@ jobs:
- name: Run process replay tests
uses: ./.github/actions/process-replay
tests:
testcpuopencl:
strategy:
fail-fast: false
matrix:
backend: [llvm, cpu, gpu]
backend: [llvm, cpu, opencl]
name: Linux (${{ matrix.backend }})
runs-on: ubuntu-22.04
timeout-minutes: 20
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -725,13 +761,13 @@ jobs:
with:
key: ${{ matrix.backend }}-minimal
deps: testing_minimal
opencl: ${{ matrix.backend == 'gpu' && 'true' }}
opencl: ${{ matrix.backend == 'opencl' && 'true' }}
llvm: ${{ matrix.backend == 'llvm' && 'true' }}
- name: Set env
run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'gpu' && 'GPU=1' }}" >> $GITHUB_ENV
run: printf "${{ matrix.backend == 'llvm' && 'CPU=1\nCPU_LLVM=1' || matrix.backend == 'cpu' && 'CPU=1\nCPU_LLVM=0\nCPU_COUNT=2' || matrix.backend == 'opencl' && 'CL=1' }}" >> $GITHUB_ENV
- name: Check Device.DEFAULT and print some source
run: |
python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','GPU'], Device.DEFAULT"
python3 -c "from tinygrad import Device; assert Device.DEFAULT in ['CPU','CL'], Device.DEFAULT"
DEBUG=5 FORWARD_ONLY=1 python3 test/test_ops.py TestOps.test_add
- name: Run pytest (${{ matrix.backend }})
run: python -m pytest -n=auto test/ --ignore=test/models --ignore=test/unit --durations=20
@ -772,7 +808,7 @@ jobs:
start_server "remote-server-amd-1" "AMD" 6667
start_server "remote-server-amd-2" "AMD" 6668
start_server "remote-server-gpu" "GPU" 7667
start_server "remote-server-gpu" "CL" 7667
start_server "remote-server-cpu" "CPU" 8667
- name: Check Device.DEFAULT and print some source
env:
@ -786,7 +822,7 @@ jobs:
HOST: 127.0.0.1:6667*6,127.0.0.1:6668*6
run: |
python3 -m pytest test/test_tiny.py test/test_jit.py test/test_subbuffer.py test/test_graph.py test/test_multitensor.py test/test_remote.py test/test_tensor_variable.py --durations 20
- name: Run REMOTE=1 Test (GPU)
- name: Run REMOTE=1 Test (CL)
env:
HOST: 127.0.0.1:7667*6
run: |
@ -807,48 +843,42 @@ jobs:
# ****** OSX Tests ******
testmetal2:
testmetal:
name: MacOS (unit)
runs-on: macos-14
timeout-minutes: 20
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
- name: Setup Environment
uses: ./.github/actions/setup-tinygrad
with:
key: metal2
key: metal
deps: testing
python-version: '3.11'
amd: 'true'
cuda: 'true'
ocelot: 'true'
llvm: 'true'
- name: Run real world test
run: METAL=1 python -m pytest -n=auto test/models/test_real_world.py --durations=20
- name: Test models (Metal)
run: METAL=1 python -m pytest -n=auto test/models -v --durations=20
- name: Run unit tests
run: METAL=1 python -m pytest -n=auto test/unit/ --durations=20
- name: Run ONNX
run: METAL=1 python -m pytest -n=auto test/external/external_test_onnx_backend.py --durations=20
- name: Test tensor core ops (fake)
run: TC=2 METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_gemm
run: METAL=1 DEBUG=3 TC=2 python test/test_ops.py TestOps.test_gemm
- name: Test tensor core ops (real)
run: METAL=1 DEBUG=3 python test/test_ops.py TestOps.test_big_gemm
- name: Test LLaMA compile speed
run: METAL=1 python test/external/external_test_speed_llama.py
- name: Test Beam Search
run: METAL=1 IGNORE_BEAM_CACHE=1 python3 -m pytest extra/optimization/test_beam_search.py
#- name: Fuzz Test linearizer
# run: METAL=1 DEPTH=4 FUZZ_N=50 FUZZ_MAX_SIZE=1000000 python test/external/fuzz_linearizer.py
- name: Run TRANSCENDENTAL math
run: TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
run: METAL=1 TRANSCENDENTAL=2 python -m pytest -n=auto test/test_ops.py::TestOps::test_sin test/test_ops.py::TestOps::test_cos test/test_ops.py::TestOps::test_tan test/test_ops.py::TestOps::test_exp test/test_ops.py::TestOps::test_log --durations=20
- name: Run pytest (amd)
env:
MOCKGPU: 1
AMD: 1
AMD_LLVM: 0
FORWARD_ONLY: 1
run: |
python3 -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py --durations=20
@ -856,13 +886,14 @@ jobs:
env:
MOCKGPU: 1
AMD: 1
AMD_LLVM: 1
FORWARD_ONLY: 1
run: |
python -m pytest -n=auto test/device/test_hcq.py test/test_tiny.py test/device/test_amd_llvm.py --durations=20
- name: Run pytest (ptx)
env:
MOCKGPU: 1
PTX: 1
NV_PTX: 1
NV: 1
FORWARD_ONLY: 1
run: |
@ -939,8 +970,6 @@ jobs:
name: MacOS (${{ matrix.backend }})
runs-on: macos-15
timeout-minutes: 20
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4
@ -976,8 +1005,6 @@ jobs:
name: Windows (${{ matrix.backend }})
runs-on: windows-latest
timeout-minutes: 15
env:
IGNORE_OOB: 0
steps:
- name: Checkout Code
uses: actions/checkout@v4

View file

@ -79,7 +79,7 @@ See [examples/beautiful_mnist.py](examples/beautiful_mnist.py) for the full vers
tinygrad already supports numerous accelerators, including:
- [x] [GPU (OpenCL)](tinygrad/runtime/ops_gpu.py)
- [x] [OpenCL](tinygrad/runtime/ops_cl.py)
- [x] [CPU](tinygrad/runtime/ops_cpu.py)
- [x] [METAL](tinygrad/runtime/ops_metal.py)
- [x] [CUDA](tinygrad/runtime/ops_cuda.py)

View file

@ -3,7 +3,7 @@
This is a list of environment variable that control the runtime behavior of tinygrad and its examples.
Most of these are self-explanatory, and are usually used to set an option at runtime.
Example: `GPU=1 DEBUG=4 python3 -m pytest`
Example: `CL=1 DEBUG=4 python3 -m pytest`
However you can also decorate a function to set a value only inside that function.
@ -31,7 +31,7 @@ These control the behavior of core tinygrad even when used as a library.
Variable | Possible Value(s) | Description
---|---|---
DEBUG | [1-7] | enable debugging output (operations, timings, speed, generated code and more)
GPU | [1] | enable the GPU (OpenCL) backend
CL | [1] | enable OpenCL backend
CUDA | [1] | enable CUDA backend
AMD | [1] | enable AMD backend
NV | [1] | enable NV backend
@ -41,8 +41,6 @@ BEAM | [#] | number of beams in kernel beam search
DEFAULT_FLOAT | [HALF, ...]| specify the default float dtype (FLOAT32, HALF, BFLOAT16, FLOAT64, ...), default to FLOAT32
IMAGE | [1-2] | enable 2d specific optimizations
FLOAT16 | [1] | use float16 for images instead of float32
PTX | [1] | enable the specialized [PTX](https://docs.nvidia.com/cuda/parallel-thread-execution/) assembler for Nvidia GPUs. If not set, defaults to generic CUDA codegen backend.
PROFILE | [1] | enable profiling. This feature is supported in NV, AMD, QCOM and METAL backends.
VISIBLE_DEVICES | [list[int]]| restricts the NV/AMD devices that are available. The format is a comma-separated list of identifiers (indexing starts with 0).
JIT | [0-2] | 0=disabled, 1=[jit enabled](quickstart.md#jit) (default), 2=jit enabled, but graphs are disabled
VIZ | [1] | 0=disabled, 1=[viz enabled](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/viz)

View file

@ -2,17 +2,17 @@
tinygrad supports various runtimes, enabling your code to scale across a wide range of devices. The default runtime can be automatically selected based on the available hardware, or you can force a specific runtime to be default using environment variables (e.g., `CPU=1`).
| Runtime | Description | Requirements |
|---------|-------------|--------------|
| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | Ampere/Ada series GPUs |
| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | RDNA2/RDNA3/RDNA4 series GPUs. You can select one of the interfaces for communication by setting `AMD_IFACE=(KFD|PCI)`. See [AMD interfaces](#amd-interfaces) for more details. |
| [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | 6xx series GPUs |
| [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | M1+ Macs; Metal 3.0+ for `bfloat` support |
| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | NVIDIA GPU with CUDA support |
| [GPU (OpenCL)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_gpu.py) | Accelerates computations using OpenCL on GPUs | OpenCL 2.0 compatible device |
| [CPU (C Code)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang compiler | `clang` compiler in system `PATH` |
| [LLVM (LLVM IR)](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_llvm.py) | Runs on CPU using the LLVM compiler infrastructure | llvm libraries installed and findable |
| [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | Dawn library installed and findable. Download binaries [here](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0). |
| Runtime | Description | Compiler Options | Requirements |
|---------|-------------|------------------|--------------|
| [NV](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_nv.py) | Provides acceleration for NVIDIA GPUs | nvrtc (default)<br>PTX (`NV_PTX=1`) | Ampere/Ada/Blackwell series GPUs.<br>You can select an interface via `NV_IFACE=(NVK\|PCI)`. See [NV interfaces](#nv-interfaces) for details. |
| [AMD](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_amd.py) | Provides acceleration for AMD GPUs | LLVM (`AMD_LLVM=1`)<br>HIP/COMGR (`AMD_HIP=1`) | RDNA2 or newer GPUs.<br>You can select an interface via `AMD_IFACE=(KFD\|PCI\|USB)`. See [AMD interfaces](#amd-interfaces) for details. |
| [QCOM](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_qcom.py) | Provides acceleration for QCOM GPUs | - | 6xx series GPUs |
| [METAL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_metal.py) | Utilizes Metal for acceleration on Apple devices | - | M1+ Macs; Metal 3.0+ for `bfloat` support |
| [CUDA](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cuda.py) | Utilizes CUDA for acceleration on NVIDIA GPUs | nvrtc (default)<br> PTX (`CUDA_PTX=1`) | NVIDIA GPU with CUDA support |
| [CL](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cl.py) | Accelerates computations using OpenCL on GPUs | - | OpenCL 2.0 compatible device |
| [CPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_cpu.py) | Runs on CPU using the clang or llvm compiler | Clang JIT (default)<br>LLVM IR (`CPU_LLVM=1`) | `clang` compiler in system `PATH` |
| [WEBGPU](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/ops_webgpu.py) | Runs on GPU using the Dawn WebGPU engine (used in Google Chrome) | - | Dawn library installed and discoverable. Binaries: [pydawn v0.3.0](https://github.com/wpmed92/pydawn/releases/tag/v0.3.0) |
## Interoperability
@ -70,5 +70,12 @@ AMD backend supports several interfaces for communicating with devices:
* `KFD`: uses the amdgpu driver
* `PCI`: uses the [AM driver](developer/am.md)
* `USB`: USB3 interafce for asm24xx chips.
You can force an interface by setting `AMD_IFACE` to one of these values. In the case of `AMD_IFACE=PCI`, this may unbind your GPU from the amdgpu driver.
## NV Interfaces
NV backend supports several interfaces for communicating with devices:
* `NVK`: uses the nvidia driver
* `PCI`: uses the [NV driver](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/runtime/support/nv/nvdev.py)

View file

@ -189,7 +189,7 @@ class GPT2:
GlobalCounters.reset()
if timing: print("")
st = GlobalCounters.time_sum_s
with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
with Timing("ran model in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=timing):
with WallTimeEvent(BenchEvent.STEP):

View file

@ -478,7 +478,7 @@ After you are done speaking, output [EOS]. You are not Chad.
with Profiling(enabled=args.profile):
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
with WallTimeEvent(BenchEvent.STEP):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
tok_tensor = llama.model(next_tok, start_pos, args.temperature)

View file

@ -441,7 +441,7 @@ if __name__ == "__main__":
with Profiling(enabled=args.profile):
with Timing("total ", on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
with WallTimeEvent(BenchEvent.STEP):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None):
tok = model(Tensor([[last_tok]], device=device), start_pos, TEMPERATURE, TOP_K, TOP_P, ALPHA_F, ALPHA_P)
@ -479,7 +479,7 @@ if __name__ == "__main__":
st = GlobalCounters.time_sum_s
with Profiling(enabled=args.profile):
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "")+
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "")+
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB"+
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):

View file

@ -279,9 +279,15 @@ def generate(model, tokenizer, prompt: str, n_tokens_to_gen: int = 10, temp: boo
# Loading in the prompt tokens
logits = model.forward(Tensor([tks]))[:, -1, :]
for _ in tqdm(range(n_tokens_to_gen), desc="Speed Gen"):
# TODO: topk
if sample:
tok_Tens = (logits/temp).softmax().multinomial()
scaled_logits = logits / temp
if top_k is not None:
topk_values, topk_indices = scaled_logits.topk(top_k)
filtered_logits = Tensor.full_like(scaled_logits, -float("inf"))
filtered_logits = filtered_logits.scatter(dim=-1, index=topk_indices, src=topk_values)
tok_Tens = filtered_logits.softmax().multinomial()
else:
tok_Tens = scaled_logits.softmax().multinomial()
else:
tok_Tens = logits.argmax(axis=-1).unsqueeze(0)
tok = tok_Tens.item()
@ -298,6 +304,7 @@ if __name__ == "__main__":
parser.add_argument("--size", type=str, default="370m",
help=f"Size of model to use [{', '.join([k for k in MODELS.keys()])}]")
parser.add_argument("--n_tokens", type=int, default=10, help="Number of tokens to generate")
parser.add_argument("--top_k", type=int, help="Limit sampling to the top k most likely tokens")
parser.add_argument("--sample", dest="sample", action="store_true", help="Sample flag")
parser.add_argument("--temp", type=float, default=1.0, help="Sampling temp has to be <=1.0")
args = parser.parse_args()
@ -308,8 +315,9 @@ if __name__ == "__main__":
num_toks = args.n_tokens
sample = args.sample
temp = args.temp
top_k = args.top_k
s = time.time()
tinyoutput = generate(model, tokenizer, prompt, n_tokens_to_gen=num_toks, sample=sample, temp=temp)
tinyoutput = generate(model, tokenizer, prompt, n_tokens_to_gen=num_toks, sample=sample, temp=temp, top_k=top_k)
print(tinyoutput)
print('TIME: ', time.time() - s)
TORCHOUTPUT = "Why is gravity \nso important?\nBecause it's the only"

View file

@ -0,0 +1,57 @@
#!/usr/bin/env bash
# adapted from https://github.com/mlcommons/training/blob/4bdf5c8ed218ad76565a2ba1ac27c919ccc6d689/stable_diffusion/README.md
# setup dirs
DATA=/raid/datasets/stable_diffusion
LAION=$DATA/laion-400m/webdataset-moments-filtered
COCO=$DATA/coco2014
mkdir -p $LAION $COCO
CKPT=/raid/weights/stable_diffusion
mkdir -p $CKPT/clip $CKPT/sd $CKPT/inception
# download data
# if rclone isn't installed system-wide / in your PATH, put the executable path in quotes below
#RCLONE=""
RCLONE="rclone"
## VAE-encoded image latents, from 6.1M image subset of laion-400m
## about 1 TB for whole download
$RCLONE config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com
$RCLONE copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/ ${LAION} --include="*.tar" -P
$RCLONE copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/laion-400m/moments-webdataset-filtered/sha512sums.txt ${LAION} -P
cd $LAION && grep -E '\.tar$' sha512sums.txt | sha512sum -c --quiet - && \
echo "All .tar files verified" || { echo "Checksum failure when validating downloaded Laion moments"; exit 1; }
## prompts and FID statistics from 30k image subset of coco2014
## 33 MB
$RCLONE config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com
$RCLONE copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/coco2014/val2014_30k.tsv ${COCO} -P
$RCLONE config create mlc-training s3 provider=Cloudflare access_key_id=76ea42eadb867e854061a1806220ee1e secret_access_key=a53625c4d45e3ca8ac0df8a353ea3a41ffc3292aa25259addd8b7dc5a6ce2936 endpoint=c2686074cb2caf5cbaf6d134bdba8b47.r2.cloudflarestorage.com
$RCLONE copy mlc-training:mlcommons-training-wg-public/stable_diffusion/datasets/coco2014/val2014_30k_stats.npz ${COCO} -P
# download checkpoints
## clip (needed for text and vision encoders for validation)
CLIP_WEIGHTS_URL="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/resolve/main/open_clip_pytorch_model.bin"
CLIP_WEIGHTS_SHA256="9a78ef8e8c73fd0df621682e7a8e8eb36c6916cb3c16b291a082ecd52ab79cc4"
CLIP_CONFIG_URL="https://huggingface.co/laion/CLIP-ViT-H-14-laion2B-s32B-b79K/raw/main/open_clip_config.json"
wget -N -P ${CKPT}/clip ${CLIP_WEIGHTS_URL}
wget -N -P ${CKPT}/clip ${CLIP_CONFIG_URL}
echo "${CLIP_WEIGHTS_SHA256} ${CKPT}/clip/open_clip_pytorch_model.bin" | sha256sum -c
## sd (needed for latent->image decoder for validation, also has clip text encoder for training)
SD_WEIGHTS_URL='https://huggingface.co/stabilityai/stable-diffusion-2-base/resolve/main/512-base-ema.ckpt'
SD_WEIGHTS_SHA256="d635794c1fedfdfa261e065370bea59c651fc9bfa65dc6d67ad29e11869a1824"
wget -N -P ${CKPT}/sd ${SD_WEIGHTS_URL}
echo "${SD_WEIGHTS_SHA256} ${CKPT}/sd/512-base-ema.ckpt" | sha256sum -c
## inception (needed for validation)
FID_WEIGHTS_URL='https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth'
FID_WEIGHTS_SHA1="bd836944fd6db519dfd8d924aa457f5b3c8357ff"
wget -N -P ${CKPT}/inception ${FID_WEIGHTS_URL}
echo "${FID_WEIGHTS_SHA1} ${CKPT}/inception/pt_inception-2015-12-05-6726825d.pth" | sha1sum -c

View file

@ -6,7 +6,7 @@ from tinygrad.schedule.kernelize import get_kernelize_map
from tinygrad.engine.schedule import create_schedule_with_vars
from tinygrad.engine.realize import run_schedule
# NOLOCALS=1 GPU=1 IMAGE=2 FLOAT16=1 VIZ=1 DEBUG=2 python3 examples/openpilot/compile4.py
# NOLOCALS=1 CL=1 IMAGE=2 FLOAT16=1 VIZ=1 DEBUG=2 python3 examples/openpilot/compile4.py
OPENPILOT_MODEL = sys.argv[1] if len(sys.argv) > 1 else "https://github.com/commaai/openpilot/raw/v0.9.7/selfdrive/modeld/models/supercombo.onnx"
OUTPUT = sys.argv[2] if len(sys.argv) > 2 else "/tmp/openpilot.pkl"

View file

@ -8,7 +8,7 @@ from typing import Dict, Union
from extra.models.llama import Transformer, convert_from_huggingface, fix_bf16
from examples.llama3 import load
from tinygrad import nn, Tensor
from tinygrad import nn, Tensor, Device
from tinygrad.helpers import fetch, colored, GlobalCounters, Timing, DEBUG
from tinygrad.nn.state import load_state_dict, get_parameters
@ -80,7 +80,7 @@ if __name__ == "__main__":
st = GlobalCounters.time_sum_s
next_tok = Tensor([toks[start_pos:]]) if tok_tensor is None or (len(toks)-start_pos) > 1 else tok_tensor.reshape(1, 1)
with Timing("total ", enabled=args.timing, on_exit=lambda x: f", {1e9/x:.2f} tok/s, {GlobalCounters.global_mem/x:.2f} GB/s, param {param_bytes/x:.2f} GB/s"):
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on GPU" if DEBUG>=2 else "") +
with Timing("enqueue in ", on_exit=(lambda et: (f", {(GlobalCounters.time_sum_s-st)*1e3:.2f} ms on {Device.DEFAULT}" if DEBUG>=2 else "") +
f", {GlobalCounters.global_ops*1e-9:.2f} GOPS, {GlobalCounters.global_mem*1e-9:.2f} GB" +
(f", {GlobalCounters.global_mem*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s, param {param_bytes*1e-9/(GlobalCounters.time_sum_s-st):.2f} GB/s" if DEBUG>=2 else "")) if DEBUG else None, enabled=args.timing):
tok_tensor = transformer(next_tok, start_pos, args.temperature)

View file

@ -437,8 +437,8 @@ if __name__ == "__main__":
im.show()
# validation!
if args.prompt == default_prompt and args.steps == 10 and args.seed == 0 and args.guidance == 6.0 and args.width == args.height == 1024 \
and not args.weights:
is_default = args.prompt == default_prompt and args.steps == 10 and args.seed == 0 and args.guidance == 6.0 and args.width == args.height == 1024
if is_default and not args.weights and not args.fakeweights:
ref_image = Tensor(np.array(Image.open(Path(__file__).parent / "sdxl_seed0.png")))
distance = (((x.cast(dtypes.float) - ref_image.cast(dtypes.float)) / ref_image.max())**2).mean().item()
assert distance < 4e-3, colored(f"validation failed with {distance=}", "red")

View file

@ -109,7 +109,7 @@ class TextDecoder:
def forward(self, x:Tensor, pos:Union[Variable, Literal[0]], encoded_audio:Tensor):
seqlen = x.shape[-1]
x = self.token_embedding(x) + self.positional_embedding.shrink(((pos, pos+seqlen), None, None))
x = self.token_embedding(x) + self.positional_embedding.shrink(((pos, pos+seqlen), None))
for block in self.blocks: x = block(x, xa=encoded_audio, mask=self.mask, len=pos)
return self.output_tok(x)

View file

@ -1,7 +1,7 @@
# copying the kernels from https://github.com/microsoft/ArchProbe into Python
import numpy as np
import pickle
from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer
from tinygrad.runtime.ops_cl import CLProgram, CLBuffer
from tinygrad import dtypes
from tqdm import trange, tqdm
from matplotlib import pyplot as plt

View file

@ -4,7 +4,7 @@ from tinygrad import dtypes
from tinygrad.codegen.assembly import AssemblyCodegen, Register
from tinygrad.codegen.opt.kernel import Ops
from tinygrad.uop.ops import BinaryOps, UnaryOps, TernaryOps
from tinygrad.runtime.ops_gpu import ROCM_LLVM_PATH
from tinygrad.runtime.ops_cl import ROCM_LLVM_PATH
# ugh, is this really needed?
from extra.helpers import enable_early_exec

View file

@ -5,7 +5,7 @@ from tinygrad.helpers import colored
from extra.helpers import enable_early_exec
early_exec = enable_early_exec()
from tinygrad.runtime.ops_gpu import CLProgram, CLBuffer, ROCM_LLVM_PATH
from tinygrad.runtime.ops_cl import CLProgram, CLBuffer, ROCM_LLVM_PATH
ENABLE_NON_ASM = False

View file

@ -10,7 +10,7 @@ from tinygrad.uop.ops import Ops
import json
from collections import OrderedDict
EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "GPU"]
EXPORT_SUPPORTED_DEVICE = ["WEBGPU", "CPU", "CUDA", "CL"]
def compile_net(run:TinyJit, special_names:Dict[int,str]) -> Tuple[Dict[str,str],List[Tuple[str,List[str],List[int]]],Dict[str,Tuple[int,DType,int]],Dict[str,Tensor]]:
functions, bufs, bufs_to_save, statements, bufnum = {}, {}, {}, [], 0

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
import numpy as np
from tinygrad.runtime.ops_gpu import CLProgram, CLCompiler
from tinygrad.runtime.ops_cl import CLProgram, CLCompiler
from tinygrad import Device, dtypes
from tinygrad.device import Buffer
from hexdump import hexdump
@ -11,7 +11,7 @@ from hexdump import hexdump
# https://registry.khronos.org/OpenCL/extensions/intel/cl_intel_subgroup_split_matrix_multiply_accumulate.html
# https://hc34.hotchips.org/assets/program/conference/day1/GPU%20HPC/Intel_s%20Ponte%20Vecchio%20GPU%20-%20Architecture%20Systems%20and%20Software%20FINAL.pdf
device = Device["GPU"]
device = Device["CL"]
# NOTE: only the subgroup type 8 ones work
prog = CLProgram(device, "test", CLCompiler(device, "test").compile(f"""
@ -26,9 +26,9 @@ __kernel void test(__global float* data0, const __global int* data1, const __glo
"""))
#with open("/tmp/test.elf", "wb") as f: f.write(prog.lib)
a = Buffer("GPU", 8, dtypes.float32).allocate()
b = Buffer("GPU", 0x10, dtypes.float16).allocate()
c = Buffer("GPU", 8*0x10, dtypes.float16).allocate()
a = Buffer("CL", 8, dtypes.float32).allocate()
b = Buffer("CL", 0x10, dtypes.float16).allocate()
c = Buffer("CL", 8*0x10, dtypes.float16).allocate()
row = np.array([1,2,3,4,5,6,7,8,1,2,3,4,5,6,7,8], np.float16)
mat = np.random.random((8, 0x10)).astype(np.float16)

View file

@ -75,7 +75,7 @@ if __name__ == "__main__":
if GEMM_VARIATION == "max" and (M%64)==0 and (N%128)==0 and (K%64)==0 and DTYPE_IN == dtypes.half and DTYPE_OUT == dtypes.float and DTYPE_ACC == dtypes.float:
print("Using CUDA and triton-generated kernel")
# See nv_triton_gemm.annotated.ptx for PTX code which was generated from `PYTHONPATH=. DEBUG=6 CUDA=1 PTX=1 python3 extra/gemm/triton_nv_matmul.py`
# See nv_triton_gemm.annotated.ptx for PTX code which was generated from `PYTHONPATH=. DEBUG=6 CUDA=1 CUDA_PTX=1 python3 extra/gemm/triton_nv_matmul.py`
# this kernel with M=N=K=4096 does 162TFLOPS, vs torch at 144TFLOPS and BEAM=8 tinygrad at 138TFLOPS. theo max is 165TFLOPS.
# WMMA element size is (M, N, K) = (16, 8, 16)

View file

@ -43,7 +43,7 @@ def matmul_kernel(c_ptr, a_ptr, b_ptr, BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N:
c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :]
tl.store(c_ptrs, c)
# CUDA=1 PTX=1 python3 extra/gemm/triton_nv_matmul.py
# CUDA=1 CUDA_PTX=1 python3 extra/gemm/triton_nv_matmul.py
if __name__ == "__main__":
BLOCK_SIZE_M, BLOCK_SIZE_N, BLOCK_SIZE_K = 64, 128, 64
M, N, K = 4096, 4096, 4096

View file

@ -270,8 +270,10 @@ class FidInceptionV3:
self.Mixed_7b = inception.Mixed_7b
self.Mixed_7c = inception.Mixed_7c
def load_from_pretrained(self):
state_dict = torch_load(str(fetch("https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth", "pt_inception-2015-12-05-6726825d.pth")))
def load_from_pretrained(self, path=None):
if path is None:
path = fetch("https://github.com/mseitzer/pytorch-fid/releases/download/fid_weights/pt_inception-2015-12-05-6726825d.pth", "pt_inception-2015-12-05-6726825d.pth")
state_dict = torch_load(str(path))
for k,v in state_dict.items():
if k.endswith(".num_batches_tracked"):
state_dict[k] = v.reshape(1)

View file

@ -272,4 +272,4 @@ def compare_launch_state(states, good_states):
return True, "PASS"
# IOCTL=1 PTX=1 CUDA=1 python3 test/test_ops.py TestOps.test_tiny_add
# IOCTL=1 CUDA=1 CUDA_PTX=1 python3 test/test_ops.py TestOps.test_tiny_add

View file

@ -7,7 +7,7 @@ rm $LOGOPS
test/external/process_replay/reset.py
CI=1 python3 -m pytest -n=auto test/test_ops.py test/test_nn.py test/test_winograd.py test/models/test_real_world.py --durations=20
GPU=1 python3 -m pytest test/test_tiny.py
CL=1 python3 -m pytest test/test_tiny.py
# extract, sort and uniq
extra/optimization/extract_dataset.py

View file

@ -50,7 +50,7 @@ class TestBeamSearch(unittest.TestCase):
def test_variable_shrink_prime_number(self):
v = Variable("v", 1, 400).bind(367)
a = rand(400, 367)
b = (a.shrink(((0,v), None))+1).reshape(367,367).realize()
b = (a.shrink(((0,v), None))+1)[:367,:367].realize()
np.testing.assert_allclose(b.numpy(), a.numpy()[:367]+1, atol=1e-4, rtol=1e-4)
def test_no_mutate_rawbuffers(self):

View file

@ -1,6 +1,6 @@
import ctypes, array
from hexdump import hexdump
from tinygrad.runtime.ops_gpu import GPUDevice
from tinygrad.runtime.ops_cl import CLDevice
from tinygrad.helpers import getenv, to_mv, mv_address
from tinygrad.dtype import dtypes
from tinygrad import Tensor, TinyJit
@ -8,7 +8,7 @@ from tinygrad.runtime.autogen import opencl as cl
if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
# create raw opencl buffer.
gdev = GPUDevice()
gdev = CLDevice()
cl_buf = cl.clCreateBuffer(gdev.context, cl.CL_MEM_READ_WRITE, 0x100, None, status := ctypes.c_int32())
assert status.value == 0

View file

@ -4,7 +4,7 @@
Only supported on 7900XTX, requires either AM (`rmmod amdgpu`) or disabling power gating on AMD (`ppfeaturemask=0xffff3fff`, don't forget to rebuild initramfs)
SQTT is implemented on top of normal tinygrad PROFILE=1, `PROFILE=1 SQTT=1` to get profile pickle with sqtt data embedded in it.
SQTT is implemented on top of normal tinygrad profiling, `VIZ=1 SQTT=1` to get profile pickle with sqtt data embedded in it.
`SQTT_BUFFER_SIZE=X` to change size of SQTT buffer (per shader engine, 6 SEs on 7900xtx) in megabytes, default 256.

View file

@ -4,13 +4,13 @@ import struct
import json
import traceback
import numpy as np
from tinygrad.runtime.ops_gpu import CLProgram, compile_gpu
from tinygrad.runtime.ops_cl import CLProgram, compile_gpu
from tinygrad.device import Device
from tinygrad.helpers import DEBUG, getenv
from collections import defaultdict
import pyopencl as cl
from tinygrad.runtime.ops_gpu import OSX_TIMING_RATIO
CL = Device["GPU"]
from tinygrad.runtime.ops_cl import OSX_TIMING_RATIO
CL = Device["CL"]
DEBUGCL = getenv("DEBUGCL", 0)
FLOAT16 = getenv("FLOAT16", 0)
@ -110,7 +110,7 @@ class Thneed:
prgs = {}
for o in jdat['binaries']:
nptr = ptr + o['length']
prgs[o['name']] = CLProgram(Device["GPU"], o['name'], weights[ptr:nptr])
prgs[o['name']] = CLProgram(Device["CL"], o['name'], weights[ptr:nptr])
ptr = nptr
# populate the cl_cache
@ -267,7 +267,7 @@ class Thneed:
for prg, args in self.cl_cache:
events.append(prg.clprg(CL.queue, *args))
mt = time.monotonic()
Device["GPU"].synchronize()
Device["CL"].synchronize()
et = time.monotonic() - st
print(f"submit in {(mt-st)*1000.0:.2f} ms, total runtime is {et*1000.0:.2f} ms")

View file

@ -35,7 +35,7 @@ def to_movement_ops(st: ShapeTracker) -> List[Tuple[MovementOps, Tuple]]:
to_apply:List[Tuple[MovementOps, Tuple]] = []
for i, v in enumerate(st.views):
real_shape = tuple(y-x for x,y in v.mask) if v.mask else v.shape
offset = v.offset + sum(st*(s-1) for s,st in zip(real_shape, v.strides) if st<0)
offset = (v.offset or 0) + sum(st*(s-1) for s,st in zip(real_shape, v.strides) if st<0)
real_offset = offset + (sum(x*st for (x,_),st in zip(v.mask, v.strides)) if v.mask else 0)
real_real_shape = [s for s,st in zip(real_shape, v.strides) if st]
strides: List[int] = [abs(st) if isinstance(st,int) else st for st in v.strides if st]

View file

@ -177,22 +177,28 @@ def cached_to_movement_ops(shape, st) -> list:
from tinygrad.shape.shapetracker import ShapeTracker, View
from extra.to_movement_ops import to_movement_ops, apply_mop, MovementOps
@wrap_view_op
def _as_strided(tensor:Tensor, size, stride, storage_offset=None):
# multiple as_strided do not compound
base = canonical_base(tensor)
# TODO: this is heavyweight
st = ShapeTracker(base.uop.st.views + (View.create(tuple(size), tuple(stride), storage_offset),))
ret = base
if TORCH_DEBUG >= 1: print("**** as_strided", tensor.shape, size, stride, st)
if prod(size) == 1: return ret.flatten()[storage_offset].reshape(size)
for mo in cached_to_movement_ops(tuple(base.shape), st): ret = apply_mop(ret, mo)
return ret
@torch.library.impl("aten::as_strided", "privateuseone")
def as_strided(tensor:torch.Tensor, size, stride, storage_offset=None):
storage_offset = storage_offset or tensor.storage_offset()
@wrap_view_op
def _as_strided(tensor:Tensor, size, stride, storage_offset=None):
# multiple as_strided do not compound
base = canonical_base(tensor)
# TODO: this is heavyweight
st = ShapeTracker(base.uop.st.views + (View.create(tuple(size), tuple(stride), storage_offset),))
ret = base
if TORCH_DEBUG >= 1: print("**** as_strided", tensor.shape, size, stride, st)
if prod(size) == 1: return ret.flatten()[storage_offset].reshape(size)
for mo in cached_to_movement_ops(tuple(base.shape), st): ret = apply_mop(ret, mo)
return ret
return _as_strided(tensor, size, stride, storage_offset)
@torch.library.impl("aten::_reshape_alias", "privateuseone")
def _reshape_alias(tensor:torch.Tensor, size, stride):
return _as_strided(tensor, size, stride)
@torch.library.impl("aten::empty_strided", "privateuseone")
def empty_strided(size, stride, dtype, layout=None, device=None, pin_memory=False):
if TORCH_DEBUG: print(f"empty_strided {size=} {stride=} {dtype=} {layout=} {device=} {pin_memory=}")

View file

@ -1,2 +1,6 @@
[pytest]
norecursedirs = extra
timeout = 180
timeout_method = thread
timeout_func_only = true
testpaths = test

View file

@ -9,12 +9,12 @@ with open(directory / 'README.md', encoding='utf-8') as f:
testing_minimal = [
"numpy",
"torch==2.7.1",
"torch==2.8.0",
"pytest",
"pytest-xdist",
"pytest-timeout",
"hypothesis",
"z3-solver",
"ml_dtypes"
]
setup(name='tinygrad',
@ -59,7 +59,7 @@ setup(name='tinygrad',
'triton': ["triton-nightly>=2.1.0.dev20231014192330"],
'linting': [
"pylint",
"mypy==1.13.0",
"mypy==1.18.1",
"typing-extensions",
"pre-commit",
"ruff",

View file

@ -3,9 +3,9 @@ from tinygrad import Device
from tinygrad.device import Buffer
from tinygrad.dtype import dtypes
from tinygrad.helpers import CI
from tinygrad.runtime.ops_gpu import CLDevice, CLAllocator, CLCompiler, CLProgram
from tinygrad.runtime.ops_cl import CLDevice, CLAllocator, CLCompiler, CLProgram
@unittest.skipUnless(Device.DEFAULT == "GPU", "Runs only on OpenCL (GPU)")
@unittest.skipUnless(Device.DEFAULT == "CL", "Runs only on OpenCL")
class TestCLError(unittest.TestCase):
@unittest.skipIf(CI, "dangerous for CI, it allocates tons of memory")
def test_oom(self):
@ -24,7 +24,7 @@ class TestCLError(unittest.TestCase):
def test_unaligned_copy(self):
data = list(range(65))
unaligned = memoryview(bytearray(data))[1:]
buffer = Buffer("GPU", 64, dtypes.uint8).allocate()
buffer = Buffer("CL", 64, dtypes.uint8).allocate()
buffer.copyin(unaligned)
result = memoryview(bytearray(len(data) - 1))
buffer.copyout(result)

View file

@ -10,10 +10,11 @@ class TestQcom(unittest.TestCase):
def __validate(imgdt, expected_pitch):
img = dev.allocator.alloc(imgdt.shape[0] * imgdt.shape[1] * 16, options:=BufferSpec(image=imgdt))
pitch = (img.descriptor[2] & 0x1fffff80) >> 7
pitch = img.texture_info.pitch
assert pitch == expected_pitch, f"Failed pitch for image: {imgdt}. Got 0x{pitch:X}, expected 0x{expected_pitch:X}"
dev.allocator.free(img, imgdt.shape[0] * imgdt.shape[1] * 16, options)
# Match opencl pitches for perf
__validate(dtypes.imageh((1, 201)), 0x680)
__validate(dtypes.imageh((16, 216)), 0x700)
__validate(dtypes.imageh((16, 9)), 0x80)

View file

@ -1,7 +1,7 @@
import random, os
from tinygrad.helpers import Timing
from tinygrad.runtime.ops_hip import compile_hip, HIPDevice
from tinygrad.runtime.ops_gpu import compile_cl, CLDevice
from tinygrad.runtime.ops_cl import compile_cl, CLDevice
# OMP_NUM_THREADS=1 strace -tt -f -e trace=file python3 test/external/external_benchmark_hip_compile.py
# AMD_COMGR_REDIRECT_LOGS=stdout AMD_COMGR_EMIT_VERBOSE_LOGS=1 python3 test/external/external_benchmark_hip_compile.py

View file

@ -1,4 +1,4 @@
from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, compile_cl
from tinygrad.runtime.ops_cl import CLDevice, CLProgram, compile_cl
if __name__ == "__main__":
dev = CLDevice()

View file

@ -1,5 +1,5 @@
# ugh, OS X OpenCL doesn't support half
from tinygrad.runtime.ops_gpu import CLDevice, CLProgram, CLCompiler
from tinygrad.runtime.ops_cl import CLDevice, CLProgram, CLCompiler
src = """#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void max_half(__global half* data0, const __global half* data1) {

View file

@ -1,6 +1,6 @@
#!/usr/bin/env python3
# cd extra/disassemblers/ && git clone --recursive github.com:geohot/cuda_ioctl_sniffer.git
# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so GPU=1 python3 test/external/external_multi_gpu.py
# LD_PRELOAD=$PWD/extra/disassemblers/cuda_ioctl_sniffer/out/sniff.so CL=1 python3 test/external/external_multi_gpu.py
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad.helpers import colored, Timing, getenv

View file

@ -1,4 +1,4 @@
from tinygrad.runtime.ops_gpu import CLProgram, CL, CLBuffer
from tinygrad.runtime.ops_cl import CLProgram, CL, CLBuffer
from tinygrad import dtypes
import time

File diff suppressed because one or more lines are too long

View file

@ -4,7 +4,7 @@ import unittest
import numpy as np
if 'IMAGE' not in os.environ:
os.environ['IMAGE'] = '2'
os.environ['GPU'] = '1'
os.environ['CL'] = '1'
os.environ['OPT'] = '2'
from tinygrad.tensor import Tensor
from tinygrad.nn import Conv2d

View file

@ -193,12 +193,12 @@ backend_test.exclude('test_adam_cpu')
backend_test.exclude('test_gradient_of_add_and_mul_cpu')
backend_test.exclude('test_gradient_of_add_cpu')
if Device.DEFAULT in ['GPU', 'METAL']:
if Device.DEFAULT in ['CL', 'METAL']:
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_2_3_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_axes_3_2_cpu')
backend_test.exclude('test_resize_upsample_sizes_nearest_cpu')
if Device.DEFAULT == "METAL" or (OSX and Device.DEFAULT == "GPU"):
if Device.DEFAULT == "METAL" or (OSX and Device.DEFAULT == "CL"):
# numerical inaccuracy
backend_test.exclude('test_mish_cpu')
backend_test.exclude('test_mish_expanded_cpu')

View file

@ -4,7 +4,7 @@ import numpy as np
import torch
from tinygrad import GlobalCounters, Tensor, Device
from tinygrad.helpers import getenv, Context
from tinygrad.helpers import getenv, Context, RANGEIFY
from tinygrad.nn.state import get_parameters
from tinygrad.engine.realize import capturing
from tinygrad.tensor import _to_np_dtype
@ -34,7 +34,7 @@ from extra.models.efficientnet import EfficientNet
from extra.models.resnet import ResNet18
from extra.models.vit import ViT
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
class TestInferenceMinKernels(unittest.TestCase):
def setUp(self):
self.training_old = Tensor.training
@ -90,7 +90,7 @@ class TestInferenceMinKernels(unittest.TestCase):
with CLCache(100):
model(inp, 0).realize()
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
class TestOptBinOp(unittest.TestCase):
def _test_no_binop_rerun(self, f1, f2=None, allowed=1):
a = Tensor.randn(16, 16)
@ -106,7 +106,7 @@ class TestOptBinOp(unittest.TestCase):
def test_no_binop_rerun(self): return self._test_no_binop_rerun(lambda a,b: a*b, lambda a,b: (a*b).reshape(16, 16, 1))
def test_no_binop_rerun_alt(self): return self._test_no_binop_rerun(lambda a,b: (a*b).reshape(16, 16, 1), lambda a,b: a*b)
def test_no_binop_rerun_reduce_broadcast(self):
return self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=2)
return self._test_no_binop_rerun(lambda a,b: a.sum()+b, lambda a,b: a.sum().reshape(1,1)+b, allowed=1 if RANGEIFY else 2)
@unittest.skip("this test started failing with the new change, based movementop issue")
def test_no_binop_rerun_transposed(self): return self._test_no_binop_rerun(lambda a,b: (a.T*b.T).T, lambda a,b: a*b)
@ -117,7 +117,7 @@ class TestOptBinOp(unittest.TestCase):
#def test_no_binop_rerun_reduce(self): return self._test_no_binop_rerun(lambda a,b: (a*b).sum(), lambda a,b: (a*b).reshape(16, 16, 1).sum())
#def test_no_binop_rerun_reduce_alt(self): return self._test_no_binop_rerun(lambda a,b: a.sum(1)+b[0], lambda a,b: a.sum(1).reshape(1,16)+b[0])
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
class TestOptReduceLoop(unittest.TestCase):
def test_loop_left(self):
a = Tensor.randn(16, 16)
@ -139,7 +139,7 @@ class TestOptReduceLoop(unittest.TestCase):
c.realize()
assert cache.count == 2, "loop right fusion broken"
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
class TestOptWChild(unittest.TestCase):
@unittest.skip("this no longer happens, use realize")
def test_unrealized_child(self):
@ -152,7 +152,7 @@ class TestOptWChild(unittest.TestCase):
d.realize()
assert cache.count == 2, "don't fuse if you have children"
@unittest.skipUnless(Device.DEFAULT == "GPU", "Not Implemented")
@unittest.skipUnless(Device.DEFAULT == "CL", "Not Implemented")
class TestOpt(unittest.TestCase):
def test_muladd(self):
a,b,c = [Tensor.randn(2,2).realize() for _ in range(3)]
@ -164,7 +164,7 @@ class TestOpt(unittest.TestCase):
def test_permute_was_pushed(self):
a = Tensor.randn(16, 16, 16)
with CLCache(2):
with CLCache(1 if RANGEIFY else 2):
c = a.sum(2)
d = c.permute(1,0).contiguous()
d.realize()
@ -172,7 +172,7 @@ class TestOpt(unittest.TestCase):
def test_permute_was_pushed_through_contract_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache(2):
with CLCache(1 if RANGEIFY else 2):
c = a.sum(-1)
d = c.reshape(16,16).permute(1,0).contiguous()
d.realize()
@ -180,7 +180,7 @@ class TestOpt(unittest.TestCase):
def test_permute_was_pushed_through_contractw1s_reshape(self):
a = Tensor.randn(4, 4, 4, 4, 4)
with CLCache(2):
with CLCache(1 if RANGEIFY else 2):
c = a.sum(-1)
d = c.reshape(16,1,16).permute(2,1,0).contiguous()
d.realize()
@ -188,7 +188,7 @@ class TestOpt(unittest.TestCase):
def test_permute_was_pushed_through_expand_reshape(self):
a = Tensor.randn(16, 16, 16)
with CLCache(2):
with CLCache(1 if RANGEIFY else 2):
c = a.sum(2)
d = c.reshape(4,4,4,4).permute(2,3,0,1).contiguous()
d.realize()

View file

@ -20,7 +20,7 @@ class TestLLaMASpeed(unittest.TestCase):
def test_llama_compile(self):
backup_program = Device[Device.DEFAULT].runtime
backup_allocator = Device[Device.DEFAULT].allocator
backup_compiler = Device[Device.DEFAULT].compiler
backup_compiler = Device[Device.DEFAULT].compiler.compile_cached
Device[Device.DEFAULT].runtime = FakeProgram
Device[Device.DEFAULT].allocator = FakeAllocator(Device.default)
@ -44,14 +44,14 @@ class TestLLaMASpeed(unittest.TestCase):
run_llama("codegen(1)")
# test no compiler use for this
Device[Device.DEFAULT].compiler = None
Device[Device.DEFAULT].compiler.compile_cached = None
run_llama("methodcache", False)
with Profiling(sort='time', frac=0.1, fn="/tmp/llama.prof", ts=5):
run_llama("profile", False)
Device[Device.DEFAULT].runtime = backup_program
Device[Device.DEFAULT].allocator = backup_allocator
Device[Device.DEFAULT].compiler = backup_compiler
Device[Device.DEFAULT].compiler.compile_cached = backup_compiler
if __name__ == '__main__':
TestLLaMASpeed().test_llama_compile()

View file

@ -1,6 +1,6 @@
import gc
from tinygrad import Tensor, UOp, Device
from tinygrad.shape.shapetracker import views_to_indexed_uops
from tinygrad.shape.shapetracker import views_to_valid_uop
from tinygrad.engine.realize import method_cache, get_program
def uops_allocated(): return sum([isinstance(x, UOp) for x in gc.get_objects()])
@ -60,7 +60,7 @@ if __name__ == "__main__":
# these caches will keep uops alive
method_cache.clear()
views_to_indexed_uops.cache_clear()
views_to_valid_uop.cache_clear()
new_uops = uops_allocated()
gc.collect()

View file

@ -11,7 +11,7 @@ if __name__ == "__main__":
for i in range(10_000):
if i % 1000 == 0:
print(f"Progress: {i}")
dt = random.choice(dtypes.ints)
dt = random.choice(dtypes.ints + tuple(dt.vec(4) for dt in dtypes.ints))
u = UOp.variable('x', random.randint(dt.min, 0), random.randint(1, dt.max), dtype=dt)
d = random.randint(1, max(1, u.arg[2]))
if d in powers_of_two: continue

View file

@ -16,7 +16,7 @@ if os.getenv("VALIDATE_HCQ", 0) != 0:
try:
import extra.qcom_gpu_driver.opencl_ioctl
from tinygrad import Device
_, _ = Device["QCOM"], Device["GPU"]
_, _ = Device["QCOM"], Device["CL"]
except Exception: pass
from tinygrad import Tensor, Device, dtypes
@ -42,9 +42,9 @@ if getenv("VALIDATE_HCQ"):
on_linearizer_did_run = extra.nv_gpu_driver.nv_ioctl.collect_last_launch_state
compare_states = extra.nv_gpu_driver.nv_ioctl.compare_launch_state
elif Device.DEFAULT == "QCOM":
print("VALIDATE_HCQ: Comparing QCOM to GPU")
print("VALIDATE_HCQ: Comparing QCOM to CL")
import extra.qcom_gpu_driver.opencl_ioctl
validate_device = Device["GPU"]
validate_device = Device["CL"]
on_linearizer_will_run = extra.qcom_gpu_driver.opencl_ioctl.before_launch
on_linearizer_did_run = extra.qcom_gpu_driver.opencl_ioctl.collect_last_launch_state
compare_states = extra.qcom_gpu_driver.opencl_ioctl.compare_launch_state
@ -302,7 +302,7 @@ if __name__ == "__main__":
for i, ast in enumerate(ast_strs[:getenv("FUZZ_N", len(ast_strs))]):
if (nth := getenv("FUZZ_NTH", -1)) != -1 and i != nth: continue
if getenv("FUZZ_IMAGEONLY") and "dtypes.image" not in ast: continue
if "dtypes.image" in ast and Device.DEFAULT not in {"GPU", "QCOM"}: continue # IMAGE is only for GPU
if "dtypes.image" in ast and Device.DEFAULT not in {"CL", "QCOM"}: continue # IMAGE is only for CL
if ast in seen_ast_strs: continue
seen_ast_strs.add(ast)

View file

@ -57,8 +57,8 @@ def eval_uop(uop:UOp, inputs:list[tuple[DType, list[Any]]]|None=None):
return out_buf.cast(uop.dtype.fmt).tolist()[0]
def not_support_multi_device():
# GPU and CUDA don't support multi device if in CI
return CI and REAL_DEV in ("GPU", "CUDA")
# CL and CUDA don't support multi device if in CI
return CI and REAL_DEV in ("CL", "CUDA")
# NOTE: This will open REMOTE if it's the default device
REAL_DEV = (Device.DEFAULT if Device.DEFAULT != "REMOTE" else Device['REMOTE'].properties.real_device)

View file

@ -1,14 +1,14 @@
#!/usr/bin/env python
import unittest
from tinygrad import Tensor
import numpy as np
from tinygrad.tensor import Tensor
import torch
def get_question_samp(bsz, seq_len, vocab_size, seed):
np.random.seed(seed)
in_ids= np.random.randint(vocab_size, size=(bsz, seq_len))
mask = np.random.choice([True, False], size=(bsz, seq_len))
seg_ids = np.random.randint(1, size=(bsz, seq_len))
seg_ids = np.random.randint(2, size=(bsz, seq_len)) # type_vocab_size
return in_ids, mask, seg_ids
def set_equal_weights(mdl, torch_mdl):
@ -45,7 +45,7 @@ class TestBert(unittest.TestCase):
seeds = (1337, 3141)
bsz, seq_len = 1, 16
for _, seed in enumerate(seeds):
for seed in seeds:
in_ids, mask, seg_ids = get_question_samp(bsz, seq_len, config['vocab_size'], seed)
out = mdl(Tensor(in_ids), Tensor(mask), Tensor(seg_ids))
torch_out = torch_mdl.forward(torch.from_numpy(in_ids).long(), torch.from_numpy(mask), torch.from_numpy(seg_ids).long())[:2]

View file

@ -1,12 +1,10 @@
import ast
import pathlib
import unittest
import ast, pathlib, unittest
import numpy as np
from PIL import Image
from tinygrad.helpers import getenv
from tinygrad.tensor import Tensor
from tinygrad import Tensor
from tinygrad.helpers import getenv, CI
from extra.models.efficientnet import EfficientNet
from extra.models.vit import ViT
from extra.models.resnet import ResNet50
@ -40,19 +38,13 @@ def preprocess(img, new=False):
img /= np.array([0.229, 0.224, 0.225]).reshape((1, -1, 1, 1))
return img
def _infer(model: EfficientNet, img):
with Tensor.train(False):
out = model.forward(Tensor(img)).argmax(axis=-1)
return out.tolist()
def _infer(model: EfficientNet, img, bs=1):
old_training = Tensor.training
Tensor.training = False
img = preprocess(img)
# run the net
if bs > 1: img = img.repeat(bs, axis=0)
out = model.forward(Tensor(img))
Tensor.training = old_training
return _LABELS[np.argmax(out.numpy()[0])]
chicken_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/Chicken.jpg')
car_img = Image.open(pathlib.Path(__file__).parent / 'efficientnet/car.jpg')
chicken_img = preprocess(Image.open(pathlib.Path(__file__).parent / 'efficientnet/Chicken.jpg'))
car_img = preprocess(Image.open(pathlib.Path(__file__).parent / 'efficientnet/car.jpg'))
class TestEfficientNet(unittest.TestCase):
@classmethod
@ -64,17 +56,20 @@ class TestEfficientNet(unittest.TestCase):
def tearDownClass(cls):
del cls.model
@unittest.skipIf(CI, "covered by test_chicken_car")
def test_chicken(self):
label = _infer(self.model, chicken_img)
self.assertEqual(label, "hen")
def test_chicken_bigbatch(self):
label = _infer(self.model, chicken_img, 2)
self.assertEqual(label, "hen")
labels = _infer(self.model, chicken_img)
self.assertEqual(_LABELS[labels[0]], "hen")
@unittest.skipIf(CI, "covered by test_chicken_car")
def test_car(self):
label = _infer(self.model, car_img)
self.assertEqual(label, "sports car, sport car")
labels = _infer(self.model, car_img)
self.assertEqual(_LABELS[labels[0]], "sports car, sport car")
def test_chicken_car(self):
labels = _infer(self.model, np.concat([chicken_img, car_img], axis=0))
self.assertEqual(_LABELS[labels[0]], "hen")
self.assertEqual(_LABELS[labels[1]], "sports car, sport car")
class TestViT(unittest.TestCase):
@classmethod
@ -87,12 +82,12 @@ class TestViT(unittest.TestCase):
del cls.model
def test_chicken(self):
label = _infer(self.model, chicken_img)
self.assertEqual(label, "cock")
labels = _infer(self.model, chicken_img)
self.assertEqual(_LABELS[labels[0]], "cock")
def test_car(self):
label = _infer(self.model, car_img)
self.assertEqual(label, "racer, race car, racing car")
labels = _infer(self.model, car_img)
self.assertEqual(_LABELS[labels[0]], "racer, race car, racing car")
class TestResNet(unittest.TestCase):
@classmethod
@ -105,12 +100,12 @@ class TestResNet(unittest.TestCase):
del cls.model
def test_chicken(self):
label = _infer(self.model, chicken_img)
self.assertEqual(label, "hen")
labels = _infer(self.model, chicken_img)
self.assertEqual(_LABELS[labels[0]], "hen")
def test_car(self):
label = _infer(self.model, car_img)
self.assertEqual(label, "sports car, sport car")
labels = _infer(self.model, car_img)
self.assertEqual(_LABELS[labels[0]], "sports car, sport car")
if __name__ == '__main__':
unittest.main()

View file

@ -5,12 +5,8 @@ from tinygrad.frontend.onnx import OnnxRunner
from tinygrad.device import Device
from tinygrad.helpers import fetch, Context
try:
from extra.onnx_helpers import validate
from extra.huggingface_onnx.huggingface_manager import DOWNLOADS_DIR, snapshot_download_with_retry
HUGGINGFACE_AVAILABLE = True
except ModuleNotFoundError:
HUGGINGFACE_AVAILABLE = False
from extra.onnx_helpers import validate
from extra.huggingface_onnx.huggingface_manager import DOWNLOADS_DIR, snapshot_download_with_retry
def run_onnx_torch(onnx_model, inputs):
import torch
@ -62,7 +58,7 @@ class TestOnnxModel(unittest.TestCase):
print(cls, _LABELS[cls])
assert "car" in _LABELS[cls] or _LABELS[cls] == "convertible"
@unittest.skipUnless(HUGGINGFACE_AVAILABLE and Device.DEFAULT == "METAL", "only run on METAL")
@unittest.skipUnless(Device.DEFAULT == "METAL", "only run on METAL")
class TestHuggingFaceOnnxModels(unittest.TestCase):
@classmethod
def setUpClass(cls):

View file

@ -53,8 +53,8 @@ class TestRealWorld(unittest.TestCase):
@unittest.skipUnless(is_dtype_supported(dtypes.float16), "need dtypes.float16")
def test_stable_diffusion(self):
params = unet_params
params["model_ch"] = 16
params["ctx_dim"] = 16
params["model_ch"] = 8
params["ctx_dim"] = 8
params["num_res_blocks"] = 1
params["n_heads"] = 2
model = UNetModel(**params)
@ -114,7 +114,7 @@ class TestRealWorld(unittest.TestCase):
helper_test("train_mnist", lambda: (Tensor.randn(BS, 1, 28, 28),), train, 0.07, 93)
@unittest.skipIf(CI and Device.DEFAULT in {"CPU", "GPU"}, "slow")
@unittest.skipIf(CI and Device.DEFAULT in {"CPU", "CL"}, "slow")
def test_train_cifar(self):
with Tensor.train():
model = SpeedyResNet(Tensor.ones((12,3,2,2)))
@ -144,6 +144,7 @@ class TestRealWorld(unittest.TestCase):
final_div_factor=1./(initial_div_factor*final_lr_ratio), total_steps=4)
assert not np.isnan(lr_scheduler.min_lr), "lr too small or initial_div_facotr too big for half"
@unittest.skipIf(CI and Device.DEFAULT == "CPU", "slow")
def test_bert(self):
with Tensor.train():
args_tiny = {"attention_probs_dropout_prob": 0.0, "hidden_dropout_prob": 0.0, "vocab_size": 30522, "type_vocab_size": 2,
@ -167,9 +168,5 @@ class TestRealWorld(unittest.TestCase):
helper_test("train_bert", lambda: (data["input_ids"], data["segment_ids"], data["input_mask"], data["masked_lm_positions"], \
data["masked_lm_ids"], data["masked_lm_weights"], data["next_sentence_labels"]), train, 0.25, 347)
def test_bert_fuse_arange(self):
with Context(FUSE_ARANGE=1):
self.test_bert()
if __name__ == '__main__':
unittest.main()

View file

@ -1,8 +1,8 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.tensor import Tensor
from tinygrad import Tensor
from extra.models.rnnt import LSTM
import numpy as np
import torch
class TestRNNT(unittest.TestCase):

View file

@ -1,9 +1,8 @@
import unittest
import time
import unittest, time
import numpy as np
from tinygrad import Device
from tinygrad.nn.state import get_parameters
from tinygrad.nn import optim
from tinygrad.tensor import Device
from tinygrad.helpers import getenv, CI
from extra.training import train
from extra.models.convnext import ConvNeXt
@ -27,7 +26,7 @@ def train_one_step(model,X,Y):
print("done in %.2f ms" % (et*1000.))
def check_gc():
if Device.DEFAULT == "GPU":
if Device.DEFAULT == "CL":
from extra.introspection import print_objects
assert print_objects() == 0
@ -40,7 +39,6 @@ class TestTrain(unittest.TestCase):
check_gc()
@unittest.skipIf(CI, "slow")
@unittest.skipIf(Device.DEFAULT in ["METAL", "WEBGPU"], "too many buffers for webgpu and metal")
def test_efficientnet(self):
model = EfficientNet(0)
X = np.zeros((BS,3,224,224), dtype=np.float32)
@ -49,7 +47,6 @@ class TestTrain(unittest.TestCase):
check_gc()
@unittest.skipIf(CI, "slow")
@unittest.skipIf(Device.DEFAULT in ["METAL", "WEBGPU"], "too many buffers for webgpu and metal")
def test_vit(self):
model = ViT()
X = np.zeros((BS,3,224,224), dtype=np.float32)
@ -57,7 +54,7 @@ class TestTrain(unittest.TestCase):
train_one_step(model,X,Y)
check_gc()
@unittest.skipIf(Device.DEFAULT in ["METAL", "WEBGPU"], "too many buffers for webgpu and metal")
@unittest.skipIf(CI, "slow")
def test_transformer(self):
# this should be small GPT-2, but the param count is wrong
# (real ff_dim is 768*4)

View file

@ -1,6 +1,6 @@
import unittest
from tinygrad import Device, Tensor, dtypes
from tinygrad.helpers import CI
from tinygrad.helpers import CI, RANGEIFY
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
# TODO: write a clean version of this
@ -93,7 +93,7 @@ class TestKernelOpts(unittest.TestCase):
a = Tensor.rand(8, N, 8, N)
r = a.sum(axis=(1,3))
helper_linearizer_opt(r, [
# openCL / GPU=1 is 256 max threads
# openCL / CL=1 is 256 max threads
[Opt(OptOps.GROUPTOP, 0, 2)], [Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(OptOps.GROUPTOP, 1, 2)], [Opt(OptOps.GROUPTOP, 1, 32)], # Checking how it works with 1 grouped_reduce.
[Opt(OptOps.GROUPTOP, 0, 2), Opt(OptOps.GROUPTOP, 1, 2)],
@ -327,13 +327,14 @@ class TestKernelOpts(unittest.TestCase):
@unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
def test_arange_opts(self):
a = Tensor.arange(128)
# NOTE: arange no longer has reduce ops available for opt
helper_linearizer_opt(a, [
[Opt(OptOps.GROUP, 0, 32)],
[Opt(OptOps.GROUPTOP, 0, 32)],
#[Opt(OptOps.GROUP, 0, 32)],
#[Opt(OptOps.GROUPTOP, 0, 32)],
[Opt(op=OptOps.LOCAL, axis=0, arg=8)],
[Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0)],
[Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8)],
[Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=1, arg=4)], # noqa: E501
#[Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8)],
#[Opt(op=OptOps.LOCAL, axis=0, arg=8), Opt(op=OptOps.UPCAST, axis=0, arg=0), Opt(op=OptOps.GROUP, axis=0, arg=8), Opt(op=OptOps.UNROLL, axis=1, arg=4)], # noqa: E501
])
@unittest.skipUnless(Device[Device.DEFAULT].renderer.has_threads, "test requires threads")
@ -350,5 +351,18 @@ class TestKernelOpts(unittest.TestCase):
] + [[Opt(OptOps.THREAD, 0, 4)] if Device[Device.DEFAULT].renderer.global_max[0] >= 4 else []]
+ [[Opt(OptOps.THREAD, 0, 8)] if Device[Device.DEFAULT].renderer.global_max[0] >= 8 else []])
@unittest.skipUnless(RANGEIFY>=1, "Kernel only fuses with rangeify")
def test_double_sum_group(self):
a = Tensor.rand(4, 4, 4)
r = a.sum((1, 2)).sum()
with self.assertRaises(KernelOptError):
helper_linearizer_opt(r, [[Opt(OptOps.GROUPTOP, 0, 16)],])
r = a.sum((1, 2)).sum()
with self.assertRaises(KernelOptError):
helper_linearizer_opt(r, [[Opt(OptOps.UNROLL, 1, 4), Opt(OptOps.GROUPTOP, 0, 16)],])
r = a.sum((1, 2)).sum()
with self.assertRaises(KernelOptError):
helper_linearizer_opt(r, [[Opt(OptOps.GROUPTOP, 1, 4), Opt(OptOps.GROUPTOP, 0, 16)],])
if __name__ == '__main__':
unittest.main()

View file

@ -77,9 +77,9 @@ class TestCopySpeed(unittest.TestCase):
np.testing.assert_equal(t.numpy(), x.numpy())
@unittest.skipIf(CI, "CI doesn't have 6 GPUs")
@unittest.skipIf(Device.DEFAULT != "GPU", "only test this on GPU")
@unittest.skipIf(Device.DEFAULT != "CL", "only test this on CL")
def testCopyCPUto6GPUs(self):
from tinygrad.runtime.ops_gpu import CLDevice
from tinygrad.runtime.ops_cl import CLDevice
if len(CLDevice.device_ids) != 6: raise unittest.SkipTest("computer doesn't have 6 GPUs")
t = Tensor.ones(N, N, device="CPU").contiguous().realize()
print(f"buffer: {t.nbytes()*1e-9:.2f} GB")
@ -87,8 +87,8 @@ class TestCopySpeed(unittest.TestCase):
with Timing("sync: ", on_exit=lambda ns: f" @ {t.nbytes()/ns:.2f} GB/s ({t.nbytes()*6/ns:.2f} GB/s total)"):
with Timing("queue: "):
for g in range(6):
t.to(f"gpu:{g}").realize()
Device["gpu"].synchronize()
t.to(f"CL:{g}").realize()
Device["CL"].synchronize()
if __name__ == '__main__':
unittest.main()

View file

@ -1,51 +1,29 @@
import unittest
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes, nn, Device, Variable
from tinygrad.helpers import CI, Context, getenv
from tinygrad.helpers import CI, Context, getenv, RANGEIFY
from tinygrad.engine.realize import run_schedule
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.engine.realize import CompiledRunner, ExecItem, get_program
from tinygrad.uop.ops import Ops
class TestArange(unittest.TestCase):
def _get_flops(self, N, opts=None):
def _get_flops(self, N):
GlobalCounters.reset()
tt = Tensor.arange(N)
sched = tt.schedule()
self.assertEqual(len(sched), 1)
p = get_program(sched[-1].ast, opts=opts)
print(p.name)
#print(p.src)
p = get_program(sched[-1].ast)
ExecItem(CompiledRunner(p), [tt.uop.buffer]).run()
np.testing.assert_equal(tt.numpy(), np.arange(N))
return p.estimates.ops
def test_complexity(self, opts=None, limit=None):
f1 = self._get_flops(256, opts)
f2 = self._get_flops(2560, opts)
print(f"{f1=}, {f2=}")
# add 1 to avoid divide by 0. arange is 0 flops now!
assert (f1 < 6000 and f2 < 6000) or ((f2+1) / (f1+1) < 16), f"bad complexity, flops {(f2+1) / (f1+1):.1f}X while inputs 10X"
if limit is not None and not getenv("PTX"):
# PTX counts index ALU in flops
assert f1 <= limit, f"{f1=}, {limit=}"
def test_complexity(self):
self.assertEqual(self._get_flops(256), 0)
self.assertEqual(self._get_flops(2560), 0)
def test_complexity_w_upcast(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4)], limit=0)
def test_complexity_w_unroll2(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 2)], limit=0)
def test_complexity_w_unroll4(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 4)], limit=0)
def test_complexity_w_unroll8(self): return self.test_complexity([Opt(OptOps.UNROLL, 0, 8)], limit=0)
def test_complexity_w_upcast_and_unroll(self): return self.test_complexity([Opt(OptOps.UPCAST, 0, 4), Opt(OptOps.UNROLL, 0, 4)], limit=0)
if Device.default.renderer.has_local:
# TODO: fix limit
def test_complexity_w_group(self): return self.test_complexity([Opt(OptOps.GROUP, 0, 16)], limit=81920)
def test_complexity_w_group_top(self): return self.test_complexity([Opt(OptOps.GROUPTOP, 0, 16)], limit=106496)
def test_complexity_w_local(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16)], limit=0)
@unittest.skip("doesn't work yet. TODO: this absolutely should work")
def test_complexity_w_local_unroll4(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.UNROLL, 0, 4)], limit=0)
@unittest.skip("doesn't work yet")
def test_complexity_w_local_and_padto(self): return self.test_complexity([Opt(OptOps.LOCAL, 0, 16), Opt(OptOps.PADTO, axis=1, arg=32)])
def test_arange_cat(self):
t = Tensor.arange(2, dtype=dtypes.int)+Tensor([3])
self.assertEqual(t.cat(t).tolist(), [3, 4, 3, 4])
class TestRand(unittest.TestCase):
def test_fused_rand_less_ops(self, noopt=1):
@ -133,7 +111,7 @@ class TestIndexing(unittest.TestCase):
X = dataset[idxs]
assert X.shape == (4,DDIM)
sched = X.schedule()
self.assertEqual(len(sched), 2)
self.assertEqual(len(sched), 1 if RANGEIFY else 2)
run_schedule(sched)
assert GlobalCounters.global_ops < 4*DSET, f"too many ops {GlobalCounters.global_ops} != {4*DSET}"
np.testing.assert_allclose(real_index, X.numpy())

View file

@ -3,6 +3,7 @@ from tinygrad import Tensor, Device, dtypes
from tinygrad.dtype import DType, ConstType
from tinygrad.uop.ops import Ops, UOp
from tinygrad.codegen import full_rewrite_to_sink
from tinygrad.helpers import RANGEIFY
from tinygrad.device import is_dtype_supported
import numpy as np
from test.helpers import not_support_multi_device
@ -155,7 +156,7 @@ class TestMovedConstFolding(unittest.TestCase):
def test_add_padded_zero(self):
# TODO: it's 1 now, this might be possible to fold
_check_ast_count(1, Tensor([1.0, 2, 3, 4]) + Tensor.zeros(2).pad(((1, 1),)))
_check_ast_count(0 if RANGEIFY else 1, Tensor([1.0, 2, 3, 4]) + Tensor.zeros(2).pad(((1, 1),)))
def test_mul_shrunk_one(self):
_check_ast_count(0, Tensor([1.0, 2, 3, 4]) * Tensor.ones(6).shrink(((1, 5),)))

View file

@ -4,12 +4,13 @@ import torch
from typing import Any, List
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import getenv, DEBUG, CI
from tinygrad.dtype import DType, DTYPES_DICT, least_upper_dtype, fp8_to_float, float_to_fp8, _to_np_dtype, _to_torch_dtype
from tinygrad.dtype import DType, DTYPES_DICT, least_upper_dtype, fp8_to_float, float_to_fp8, _to_np_dtype, _to_torch_dtype, truncate
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.renderer.nir import NIRRenderer
from tinygrad import Device, Tensor, dtypes
from hypothesis import assume, given, settings, strategies as strat
from test.helpers import rand_for_dtype
from test.unit.test_dtype_spec import _assert_eq, core_dtypes, dtype_ints, dtype_floats, FP8E4M3_MAX, FP8E5M2_MAX
import ml_dtypes
import pytest
pytestmark = pytest.mark.filterwarnings("ignore")
@ -25,6 +26,7 @@ def get_available_cast_dtypes(dtype: DType) -> List[DType]:
def _to_torch_storage_type(dtype:DType):
if dtype == dtypes.bfloat16: return torch.float32
if dtype in dtypes.fp8s: return torch.float32
return _to_torch_dtype(dtype)
def _test_to_np(a:Tensor, np_dtype, target):
@ -47,12 +49,15 @@ def _test_cast(a:Tensor, target_dtype:DType):
# TODO: struct.pack cannot pack value > 65504 (max of half) into e format
a = (a > 65504).where(65504, a)
_test_op(lambda: a.cast(target_dtype), target_dtype, list(a.numpy().astype(_to_np_dtype(target_dtype))))
expected = list(a.numpy().astype(_to_np_dtype(target_dtype)))
if target_dtype in dtypes.fp8s: expected = list(map(lambda x: truncate[target_dtype](x), expected))
_test_op(lambda: a.cast(target_dtype), target_dtype, expected)
def _test_bitcast(a:Tensor, target_dtype:DType, target=None):
if getenv("PTX") and a.dtype == dtypes.int8 and target_dtype.itemsize != a.dtype.itemsize:
if isinstance(Device[Device.DEFAULT].renderer, PTXRenderer) and a.dtype == dtypes.int8 and target_dtype.itemsize != a.dtype.itemsize:
raise unittest.SkipTest("shape changing bitcast of int8 broken on PTX")
expected = torch.tensor(a.tolist(), dtype=_to_torch_storage_type(a.dtype)).view(_to_torch_dtype(target_dtype))
_test_op(lambda: a.bitcast(target_dtype), target_dtype, target or expected.tolist())
expected = torch.tensor(a.tolist(), dtype=_to_torch_storage_type(a.dtype)).view(_to_torch_dtype(target_dtype)).tolist()
if target_dtype in dtypes.fp8s: expected = list(map(lambda x: fp8_to_float(x, target_dtype), expected))
_test_op(lambda: a.bitcast(target_dtype), target_dtype, target or expected)
class TestDType(unittest.TestCase):
DTYPE: Any = None
@ -100,8 +105,7 @@ class TestDType(unittest.TestCase):
))
@unittest.skipIf(Device.DEFAULT == "PYTHON", "skip for now")
@unittest.skipIf(getenv("PTX"), "skip for now")
@unittest.skipIf(getenv("NIR"), "skip for now")
@unittest.skipIf(isinstance(r:=Device[Device.DEFAULT].renderer, PTXRenderer) or isinstance(r, NIRRenderer), "skip for now")
def test_uint_overflow(self):
if not dtypes.is_unsigned(self.DTYPE): raise unittest.SkipTest("only for unsigned")
v = dtypes.max(self.DTYPE)
@ -129,11 +133,10 @@ class TestDType(unittest.TestCase):
np.testing.assert_allclose(tin, tor, atol=1e-6, rtol=1e-3)
def test_finfo(self):
if self.DTYPE not in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]: return
info = ml_dtypes.finfo(ml_dtypes.bfloat16 if self.DTYPE is dtypes.bfloat16 else _to_np_dtype(self.DTYPE))
assert info.bits == self.DTYPE.itemsize*8
assert info.nexp == dtypes.finfo(self.DTYPE)[0]
assert info.nmant == dtypes.finfo(self.DTYPE)[1]
if self.DTYPE not in [dtypes.float16, dtypes.float32, dtypes.float64]: return
info = np.finfo(_to_np_dtype(self.DTYPE))
self.assertEqual(info.bits, self.DTYPE.itemsize*8)
self.assertEqual((info.nexp, info.nmant), dtypes.finfo(self.DTYPE))
def _test_ops(a_dtype:DType, b_dtype:DType, target_dtype=None):
target_dtype = target_dtype or least_upper_dtype(a_dtype, b_dtype)
@ -151,7 +154,8 @@ class TestFp8s(unittest.TestCase):
class TestFp8sConversions(unittest.TestCase):
@given(strat.floats(width=32, allow_subnormal=True, allow_nan=False, allow_infinity=False, min_value=-FP8E4M3_MAX, max_value=FP8E4M3_MAX))
def test_float_to_fp8e4m3(self, x): np.testing.assert_equal(float_to_fp8(x, dtypes.fp8e4m3), ml_dtypes.float8_e4m3fn(x).tobytes()[0])
def test_float_to_fp8e4m3(self, x):
np.testing.assert_equal(float_to_fp8(x, dtypes.fp8e4m3), torch.tensor(x, dtype=torch.float8_e4m3fn).view(torch.uint8).item())
def test_float_to_fp8e4m3_extreme_values(self):
np.testing.assert_equal(float_to_fp8(FP8E4M3_MAX, dtypes.fp8e4m3), 126)
@ -164,7 +168,8 @@ class TestFp8sConversions(unittest.TestCase):
np.testing.assert_equal(float_to_fp8(-math.nan, dtypes.fp8e4m3), 255)
@given(strat.floats(width=32, allow_subnormal=True, allow_nan=False, allow_infinity=False, min_value=-FP8E5M2_MAX, max_value=FP8E5M2_MAX))
def test_float_to_fp8e5m2(self, x): np.testing.assert_equal(float_to_fp8(x, dtypes.fp8e5m2), ml_dtypes.float8_e5m2(x).tobytes()[0])
def test_float_to_fp8e5m2(self, x):
np.testing.assert_equal(float_to_fp8(x, dtypes.fp8e5m2), torch.tensor(x, dtype=torch.float8_e5m2).view(torch.uint8).item())
def test_float_to_fp8e5m2_extreme_values(self):
np.testing.assert_equal(float_to_fp8(FP8E5M2_MAX, dtypes.fp8e5m2), 123)
@ -177,10 +182,12 @@ class TestFp8sConversions(unittest.TestCase):
np.testing.assert_equal(float_to_fp8(-math.nan, dtypes.fp8e5m2), 254)
@given(strat.integers(min_value=0, max_value=255))
def test_fp8e4m3_to_float(self, x): np.testing.assert_equal(fp8_to_float(x, dtypes.fp8e4m3), np.uint8(x).view(ml_dtypes.float8_e4m3fn).item())
def test_fp8e4m3_to_float(self, x):
np.testing.assert_equal(fp8_to_float(x, dtypes.fp8e4m3), torch.tensor(x, dtype=torch.uint8).view(torch.float8_e4m3fn).float().item())
@given(strat.integers(min_value=0, max_value=255))
def test_fp8e5m2_to_float(self, x): np.testing.assert_equal(fp8_to_float(x, dtypes.fp8e5m2), np.uint8(x).view(ml_dtypes.float8_e5m2).item())
def test_fp8e5m2_to_float(self, x):
np.testing.assert_equal(fp8_to_float(x, dtypes.fp8e5m2), torch.tensor(x, dtype=torch.uint8).view(torch.float8_e5m2).float().item())
@unittest.skipUnless(is_dtype_supported(dtypes.bfloat16), "bfloat16 not supported")
class TestBFloat16(unittest.TestCase):
@ -256,8 +263,9 @@ class TestFloatDType(TestDType):
class TestDoubleDType(TestDType):
DTYPE = dtypes.double
@unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or getenv("PTX") or getenv("NIR"),
"conversion not supported on CI, CUDA, PTX and NIR") # TODO: why not?
@unittest.skipIf((CI and Device.DEFAULT in {"CUDA", "NV"}) or \
isinstance(Device[Device.DEFAULT].renderer, PTXRenderer) or \
isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "conversion not supported on CI CUDA, PTX, and NIR") # TODO: why not?
def test_float64_increased_precision(self):
for func in [
lambda t: t.exp(),
@ -281,21 +289,21 @@ class TestDoubleDType(TestDType):
class TestInt8DType(TestDType):
DTYPE = dtypes.int8
@unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
@unittest.skipIf(getenv("CUDA",0)==1 or isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "cuda saturation works differently")
def test_int8_to_uint8_negative(self):
_test_op(lambda: Tensor([-1, -2, -3, -4], dtype=dtypes.int8).cast(dtypes.uint8), dtypes.uint8, [255, 254, 253, 252])
def test_int8_to_uint16_negative(self):
_test_op(lambda: Tensor([-1, -2, -3, -4], dtype=dtypes.int8).cast(dtypes.uint16), dtypes.uint16, [2**16-1, 2**16-2, 2**16-3, 2**16-4])
@unittest.skipIf(getenv("PTX"), "broken in ptx")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "broken in ptx")
def test_bitcast_alt(self):
a = Tensor([72, -90, 27, 40, -53, 70, 96, 51], dtype=dtypes.int8).bitcast(dtypes.short)
self.assertListEqual(a.tolist(), [-22968, 10267, 18123, 13152])
class TestUint8DType(TestDType):
DTYPE = dtypes.uint8
@unittest.skipIf(getenv("CUDA",0)==1 or getenv("PTX", 0)==1, "cuda saturation works differently")
@unittest.skipIf(getenv("CUDA",0)==1 or isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "cuda saturation works differently")
def test_uint8_to_int8_overflow(self):
_test_op(lambda: Tensor([255, 254, 253, 252], dtype=dtypes.uint8).cast(dtypes.int8), dtypes.int8, [-1, -2, -3, -4])
@ -303,9 +311,11 @@ class TestBitCast(unittest.TestCase):
@given(strat.sampled_from(dtype_ints + dtype_floats), strat.sampled_from(dtype_ints + dtype_floats))
def test_shape_change_bitcast(self, dt1, dt2):
# NOTE: this has to be assume to prevent hypothesis from skipping all samples
assume(not (getenv("PTX") and dt1 == dtypes.int8)) # TODO: bitcasting int8 fails in PTX
assume(not (isinstance(Device[Device.DEFAULT].renderer, PTXRenderer) and dt1 == dtypes.int8)) # TODO: bitcasting int8 fails in PTX
data = rand_for_dtype(dt1, 32).reshape(2, 2, 8)
expected = torch.tensor(data.tolist(), dtype=_to_torch_storage_type(dt1)).view(_to_torch_dtype(dt2))
if dt2 in dtypes.fp8s:
expected = torch.tensor(list(map(lambda x: fp8_to_float(x, dt2), expected.view(-1).tolist()))).view_as(expected)
_test_op(lambda: Tensor(data, dtype=dt1).bitcast(dt2), dt2, expected.tolist())
def test_shape_change_bitcast_exceptions(self):
@ -348,6 +358,9 @@ class TestBoolDType(TestDType): DTYPE = dtypes.bool
class TestBFloat16Type(TestDType): DTYPE = dtypes.bfloat16
class TestFp8e4m3(TestDType): DTYPE = dtypes.fp8e4m3
class TestFp8e5m2(TestDType): DTYPE = dtypes.fp8e5m2
class TestPtrDType(unittest.TestCase):
def test_vec_double(self):
dt1 = dtypes.float.vec(4).ptr().vec(4)
@ -424,7 +437,7 @@ class TestDtypeUsage(unittest.TestCase):
class TestOpsBFloat16(unittest.TestCase):
def test_cast(self):
# TODO: helper_test_op breaks in unrelated part
# TODO: wrong output with GPU=1 on mac
# TODO: wrong output with CL=1 on mac
data = [60000.0, 70000.0, 80000.0]
np.testing.assert_allclose(Tensor(data).cast("bfloat16").numpy(), torch.tensor(data).type(torch.bfloat16).float().numpy())

View file

@ -1,13 +1,14 @@
import unittest, operator, math
from tinygrad import Tensor, dtypes, Device
from tinygrad.dtype import DType
from tinygrad.dtype import DType, truncate
from tinygrad.helpers import CI, getenv
from tinygrad.tensor import _to_np_dtype
from tinygrad.device import is_dtype_supported
from tinygrad.runtime.ops_python import from_storage_scalar
from tinygrad.renderer.ptx import PTXRenderer
import numpy as np
import pytest
from hypothesis import given, strategies as strat, settings, HealthCheck
from hypothesis import assume, given, strategies as strat, settings, HealthCheck
pytestmark = pytest.mark.filterwarnings("ignore")
@ -47,6 +48,8 @@ class ht:
int64 = strat.integers(-9223372036854775808, 9223372036854775807)
bool = strat.booleans()
ht.bfloat16 = ht.uint16
ht.fp8e4m3 = ht.uint8
ht.fp8e5m2 = ht.uint8
def universal_test(a, b, dtype, op):
if not isinstance(op, tuple): op = (op, op)
@ -56,8 +59,9 @@ def universal_test(a, b, dtype, op):
ta, tb = Tensor([a], dtype=dtype), Tensor([b], dtype=dtype)
tensor_value = (op[0](ta, tb)).numpy()
numpy_value = op[1](ta.numpy(), tb.numpy())
if dtype in dtypes.fp8s: numpy_value = truncate[dtype](numpy_value)
if dtype in dtypes.floats:
atol, rtol = {dtypes.bfloat16:(1e-3, 1e-2)}.get(dtype, (1e-10, 1e-7))
atol, rtol = {dtypes.bfloat16:(1e-3, 1e-2), dtypes.fp8e4m3:(1e-1, 1e-1), dtypes.fp8e5m2:(1.0, 5e-1)}.get(dtype, (1e-10, 1e-7))
np.testing.assert_allclose(tensor_value, numpy_value, atol=atol, rtol=rtol)
else: np.testing.assert_equal(tensor_value, numpy_value)
@ -70,8 +74,10 @@ def universal_test_unary(a, dtype, op):
out: Tensor = op[0](ta)
tensor_value = out.numpy()
numpy_value = op[1](ta.numpy())
if dtype in dtypes.fp8s: numpy_value = truncate[dtype](numpy_value)
if dtype in dtypes.floats:
atol, rtol = {dtypes.float16:(1e-3, 1e-2), dtypes.bfloat16:(1e-3, 2e-2)}.get(dtype, (1e-6, 1e-5))
atol, rtol = { dtypes.float16:(1e-3, 1e-2), dtypes.bfloat16:(1e-3, 2e-2),
dtypes.fp8e4m3:(1e-1, 1e-1), dtypes.fp8e5m2: (1.0, 5e-1)}.get(dtype, (1e-6, 1e-5))
np.testing.assert_allclose(tensor_value, numpy_value, atol=atol, rtol=rtol)
else: np.testing.assert_equal(tensor_value, numpy_value)
@ -91,7 +97,7 @@ def universal_test_midcast(a, b, c, op1, op2, d1:DType, d2:DType):
an, bn, cn = np.array([a]).astype(_to_np_dtype(d1)), np.array([b]).astype(_to_np_dtype(d1)), np.array([c]).astype(_to_np_dtype(d2))
tensor_value = op2[0](op1[0](at, bt).cast(d2), ct).numpy()
numpy_value = op2[1](op1[1](an, bn).astype(_to_np_dtype(d2)), cn)
np.testing.assert_allclose(tensor_value, numpy_value, rtol=1e-6 if getenv("PTX") else 1e-7)
np.testing.assert_allclose(tensor_value, numpy_value, rtol=1e-6 if isinstance(Device[Device.DEFAULT].renderer, PTXRenderer) else 1e-7)
class TestDTypeALU(unittest.TestCase):
@unittest.skipUnless(is_dtype_supported(dtypes.float64), f"no float64 on {Device.DEFAULT}")
@ -110,6 +116,16 @@ class TestDTypeALU(unittest.TestCase):
def test_bfloat16(self, a, b, op):
universal_test(from_storage_scalar(a, dtypes.bfloat16), from_storage_scalar(a, dtypes.bfloat16), dtypes.bfloat16, op)
@unittest.skipUnless(is_dtype_supported(dtypes.fp8e4m3), f"no fp8e4m3 on {Device.DEFAULT}")
@given(ht.fp8e4m3, ht.fp8e4m3, strat.sampled_from(binary_operations))
def test_fp8e4m3(self, a, b, op):
universal_test(from_storage_scalar(a, dtypes.fp8e4m3), from_storage_scalar(b, dtypes.fp8e4m3), dtypes.fp8e4m3, op)
@unittest.skipUnless(is_dtype_supported(dtypes.fp8e5m2), f"no fp8e5m2 on {Device.DEFAULT}")
@given(ht.fp8e5m2, ht.fp8e5m2, strat.sampled_from(binary_operations))
def test_fp8e5m2(self, a, b, op):
universal_test(from_storage_scalar(a, dtypes.fp8e5m2), from_storage_scalar(b, dtypes.fp8e5m2), dtypes.fp8e5m2, op)
@given(ht.float32, strat.sampled_from(unary_operations))
def test_float32_unary(self, a, op): universal_test_unary(a, dtypes.float32, op)
@ -121,6 +137,18 @@ class TestDTypeALU(unittest.TestCase):
@given(ht.bfloat16, strat.sampled_from(unary_operations))
def test_bfloat16_unary(self, a, op): universal_test_unary(from_storage_scalar(a, dtypes.bfloat16), dtypes.bfloat16, op)
@unittest.skipUnless(is_dtype_supported(dtypes.fp8e4m3), f"no fp8e4m3 on {Device.DEFAULT}")
@given(ht.fp8e4m3, strat.sampled_from(unary_operations))
def test_fp8e4m3_unary(self, a, op):
if op[1] == np.reciprocal: assume(from_storage_scalar(a, dtype=dtypes.fp8e4m3) != 0.0)
universal_test_unary(from_storage_scalar(a, dtype=dtypes.fp8e4m3), dtypes.fp8e4m3, op)
@unittest.skipUnless(is_dtype_supported(dtypes.fp8e5m2), f"no fp8e5m2 on {Device.DEFAULT}")
@given(ht.fp8e5m2, strat.sampled_from(unary_operations))
def test_fp8e5m2_unary(self, a, op):
if op[1] == np.reciprocal: assume(from_storage_scalar(a, dtype=dtypes.fp8e5m2) != 0.0)
universal_test_unary(from_storage_scalar(a, dtype=dtypes.fp8e5m2), dtypes.fp8e5m2, op)
@given(ht.uint8, ht.uint8, strat.sampled_from(integer_binary_operations))
def test_uint8(self, a, b, op): universal_test(a, b, dtypes.uint8, op)

View file

@ -7,7 +7,7 @@ from tinygrad.engine.realize import lower_schedule
from tinygrad.helpers import prod, unwrap
from test.helpers import REAL_DEV
IMAGE_SUPPORTED_DEVICES = ("QCOM", "GPU")
IMAGE_SUPPORTED_DEVICES = ("QCOM", "CL")
@unittest.skipUnless(REAL_DEV in IMAGE_SUPPORTED_DEVICES, "Images not supported")
class TestImageCopy(unittest.TestCase):

View file

@ -609,21 +609,22 @@ class TestJitFree(unittest.TestCase):
ext_tensor = Tensor([1,24,23,45,1])
@TinyJit
def fxn(x:Tensor):
out = (x*2+ext_tensor).reshape(5,1).expand(5, 100).contiguous()
return out.sum()
t1 = (x * 2).contiguous().realize()
t2 = (t1 + ext_tensor).contiguous().realize()
out = (t2.sum()).contiguous().realize()
return out
for i in range(5):
out = fxn(Tensor([i,1,2,3,4]))
self.assertEqual(out.item(), 11400+200*i)
out = fxn(inp:=Tensor([i,1,2,3,4]))
self.assertEqual(out.item(), 114+2*i)
pre_free = GlobalCounters.mem_used
fxn.captured.free_intermediates()
savings_after_free = pre_free - GlobalCounters.mem_used
# Different allocator implementations have different savings.
expected_savings = 8196 if hasattr(Device[Device.DEFAULT].allocator, '_offset') else 2024
expected_savings = (len(inp) * inp.dtype.itemsize * 2) + dtypes.float32.itemsize # (t1 and t2) + out
self.assertEqual(savings_after_free, expected_savings)
out = fxn(Tensor([11,1,2,3,4]))
self.assertEqual(out.item(), 13600)
self.assertEqual(out.item(), 136)
# Try one more time...
pre_free = GlobalCounters.mem_used
@ -633,7 +634,7 @@ class TestJitFree(unittest.TestCase):
self.assertEqual(savings_after_free, expected_savings)
out = fxn(Tensor([11,1,2,3,4]))
self.assertEqual(out.item(), 13600)
self.assertEqual(out.item(), 136)
def test_updated_not_freed(self):
x = Tensor([1]).realize()

View file

@ -16,14 +16,14 @@ class TestKernelCache(unittest.TestCase):
a1 = Tensor.rand(4,4).realize()
b1 = Tensor.rand(4,4).realize()
orig_compile_func = Device['CPU'].compiler
Device['CPU'].compiler = None # making it not callable
orig_compile_func = Device['CPU'].compiler.compile_cached
Device['CPU'].compiler.compile_cached = None # making it not callable
try:
x1 = a1 + b1 + unique_const
x1.realize() # Same kernel should be from cache.
finally:
Device['CPU'].compiler = orig_compile_func
Device['CPU'].compiler.compile_cached = orig_compile_func
if __name__ == "__main__":
unittest.main()

View file

@ -10,9 +10,10 @@ from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.shape.view import View
from tinygrad.tensor import Tensor, _to_np_dtype
from tinygrad.engine.realize import run_schedule, lower_schedule, CompiledRunner, get_program
from tinygrad.helpers import Context, getenv, flatten, dedup, TC_SELECT, TC_OPT
from tinygrad.helpers import Context, flatten, dedup, TC_SELECT, TC_OPT
from tinygrad.dtype import DType, dtypes, PtrDType, AddrSpace
from tinygrad.codegen import apply_rewrites, rewrites_for_views
from tinygrad.renderer.ptx import PTXRenderer
class TestLinearizer(unittest.TestCase):
def test_arg_dedup(self):
@ -155,7 +156,7 @@ class TestLinearizer(unittest.TestCase):
@unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
@unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
@unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
@unittest.skipIf(getenv("PTX") or getenv("NIR"), "broken on ptx/nir because of indexing patterns")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "broken on ptx for some reason")
def test_upcast_with_locals(self):
x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
r = (x@y).relu()
@ -366,7 +367,7 @@ class TestLinearizer(unittest.TestCase):
helper(Tensor.arange(255), max_ops=2)
@unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
@unittest.skipIf(getenv("PTX") or getenv("NIR"), "broken on ptx/nir because of indexing patterns")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "broken on ptx for some reason")
def test_grouped_store_phis(self):
"""
float4 acc0 = float4(0.0,0.0,0.0,0.0);
@ -420,7 +421,7 @@ class TestLinearizer(unittest.TestCase):
@unittest.skipUnless(Device[Device.DEFAULT].renderer.has_local, "test requires locals")
@unittest.skipUnless(Device[Device.DEFAULT].renderer.has_shared, "test requires shared")
@unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "test requires float4")
@unittest.skipIf(getenv("PTX") or getenv("NIR"), "broken on ptx/nir because of indexing patterns")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "broken on ptx for some reason")
def test_grouped_store_local_only(self):
x, y = Tensor.rand(1,128), Tensor.rand(128, 128)
r = (x@y).relu()
@ -481,7 +482,7 @@ def helper_realized_ast(r:Tensor|list[Tensor]) -> tuple[UOp, list[Buffer]]:
assert s[-1].ast.op is Ops.SINK, f"helper_realized_ast expects a SINK {s[-1]}"
# now all input buffers in s[-1] should be realized
# create fresh buffers for the outputs
bufs = [Buffer((x).device, x.size, x.dtype).allocate() if i < len(s[-1].ast.src) else x for i,x in enumerate(s[-1].bufs)]
bufs = [Buffer(x.device, x.size, x.dtype).allocate() if i < len(s[-1].ast.src) else x for i,x in enumerate(s[-1].bufs)]
return push_views(s[-1].ast), bufs
def helper_linearizer_ast(ast:UOp, inputs:list[Tensor], *args, **kwargs):
@ -503,7 +504,7 @@ def reset_bufs(bufs:list[Buffer]):
def _helper_linearizer_opt_ast(realized_ast:UOp, real_bufs:list[Buffer], opts=[],
apply_tc=False, atol=1e-4, rtol=1e-4, color_sizes=[], wanna_output=[]):
outbufs = [real_bufs[x.src[0].base.arg] for x in realized_ast.src]
outbufs = real_bufs[:len(realized_ast.src)]
device = real_bufs[0].device
wanna_output = [np.array(x).flatten() for x in wanna_output]

View file

@ -6,13 +6,12 @@ import unittest
from tinygrad import Device, dtypes
from tinygrad.device import is_dtype_supported
from tinygrad.uop.ops import UOp, Ops, AxisType, KernelInfo
from tinygrad.helpers import getenv
from tinygrad.shape.shapetracker import ShapeTracker, View
from tinygrad.codegen.opt.search import Opt, OptOps
from tinygrad.engine.realize import get_program
from tinygrad.renderer.ptx import PTXRenderer
class TestLinearizerFailure(unittest.TestCase):
@unittest.expectedFailure
@unittest.skipUnless(Device.DEFAULT == "METAL", "only tested on METAL")
def test_failure_beam_mnist(self):
c0 = UOp(Ops.DEFINE_GLOBAL, dtypes.uchar.ptr(4014080), arg=0, src=())
@ -47,7 +46,8 @@ class TestLinearizerDumb(unittest.TestCase):
c10 = UOp(Ops.VIEW, dtypes.void, arg=ShapeTracker(views=(View(shape=(1000, 1000), strides=(0, 0), offset=0, mask=None, contiguous=False),)), src=())
c11 = c1.store((c4.alu(Ops.CMPNE, c7).alu(Ops.CMPNE, UOp.const(dtypes.bool, True, src=c8)).cast(dtypes.int)*(c9.f(Ops.VALID, dtype=dtypes.bool).where(UOp.const(dtypes.int, -1, src=c10), UOp.const(dtypes.int, 0, src=c10)).f(Ops.REDUCE_AXIS, arg=(Ops.ADD, (1,)))+UOp.const(dtypes.int, 1000, src=c8))))
ast = c11.sink()
opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8)]
#opts = [Opt(op=OptOps.UNROLL, axis=0, arg=4), Opt(op=OptOps.LOCAL, axis=0, arg=8)]
opts = [Opt(op=OptOps.LOCAL, axis=0, arg=8)]
prg = get_program(ast, Device[Device.DEFAULT].renderer, opts)
print(prg.src)
assert prg.uops is not None and not any(uop.op is Ops.MAX for uop in prg.uops), "leftover MAX"
@ -93,7 +93,7 @@ class TestLinearizerDumb(unittest.TestCase):
@unittest.expectedFailure
@unittest.skipUnless(Device[Device.DEFAULT].renderer.supports_float4, "need float4")
@unittest.skipIf(getenv("PTX"), "this is somehow correct in PTX")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "this is somehow correct in PTX")
def test_upcasted_stores_out_of_order(self):
c0 = UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(9360), arg=0, src=())
c1 = c0.view(ShapeTracker(views=(View(shape=(4, 5, 13, 1, 1, 1, 1, 1, 4, 3, 3), strides=(2340, 468, 36, 0, 0, 0, 0, 0, 9, 3, 1), offset=0, mask=None, contiguous=True),)))

View file

@ -5,9 +5,9 @@ from tinygrad.nn.state import get_state_dict
class TestMethodCache(unittest.TestCase):
def setUp(self):
self.backup_compiler = Device[Device.DEFAULT].compiler
self.backup_compiler = Device[Device.DEFAULT].compiler.compile_cached
def tearDown(self):
Device[Device.DEFAULT].compiler = self.backup_compiler
Device[Device.DEFAULT].compiler.compile_cached = self.backup_compiler
def test_simple_methodcache(self):
a = Tensor([1])
@ -15,19 +15,19 @@ class TestMethodCache(unittest.TestCase):
c = Tensor([3])
d = Tensor([4])
(a+b).realize()
Device[Device.DEFAULT].compiler = None
Device[Device.DEFAULT].compiler.compile_cached = None
(c+d).realize()
def test_nested_methodcache(self):
a,b,c,d = Tensor([1]), Tensor([2]), Tensor([3]), Tensor([4])
((a+b)+(a+b)).realize()
Device[Device.DEFAULT].compiler = None
Device[Device.DEFAULT].compiler.compile_cached = None
((c+d)+(c+d)).realize()
def test_nested_methodcache_swap(self):
a,b,c,d = Tensor([1]), Tensor([2]), Tensor([3]), Tensor([4])
((a+b)+(c+d)).realize()
Device[Device.DEFAULT].compiler = None
Device[Device.DEFAULT].compiler.compile_cached = None
((c+d)+(a+b)).realize()
@unittest.skip("incorrect use of transformer")
@ -38,7 +38,7 @@ class TestMethodCache(unittest.TestCase):
# NOTE: you have to do this twice due to the k-v cache
for i in range(3): model(Tensor([[1,2,3,4]]), Variable("start_pos", 0, 10).bind(i)).realize()
for i in range(3): model(Tensor([[1,2,3,4]]), Variable("start_pos", 0, 10).bind(i)).realize()
Device[Device.DEFAULT].compiler = None
Device[Device.DEFAULT].compiler.compile_cached = None
for i in range(3): model(Tensor([[1,2,3,4]]), Variable("start_pos", 0, 10).bind(i)).realize()
if __name__ == '__main__':

View file

@ -2,7 +2,7 @@ import unittest, functools, random
from tinygrad import Tensor, Device, nn, GlobalCounters, TinyJit, dtypes, Variable
from tinygrad.device import is_dtype_supported
from tinygrad.uop.ops import Ops, UOp
from tinygrad.helpers import CI, getenv, prod, Context
from tinygrad.helpers import CI, getenv, prod, Context, RANGEIFY
from tinygrad.nn.state import get_parameters, get_state_dict
from tinygrad.engine.realize import lower_schedule, BufferCopy, CompiledRunner, run_schedule
import numpy as np
@ -178,16 +178,14 @@ class TestMultiTensor(unittest.TestCase):
run_schedule(sched)
np.testing.assert_equal(xt.numpy(), X_np[i*2:i*2+2])
@given(strat.sampled_from((4, 5)), strat.sampled_from((devices_2, devices_3)),
@given(strat.sampled_from((devices_2, devices_3)),
strat.sampled_from((Ops.ADD, Ops.MUL, Ops.MAX)),
strat.sampled_from((None, 0, 1)), strat.sampled_from((None, 0, 1)), strat.sampled_from((1, 0, -1)))
def test_simple_reduce(self, N, devices, rop, shard_axis, reduce_axis, sign):
N = N * len(devices)
X = Tensor.rand(N*N).reshape(N, N).mul(sign)
strat.sampled_from((None, 0, 1)), strat.sampled_from((None, 0, 1)))
def test_simple_reduce(self, devices, rop, shard_axis, reduce_axis):
N = 4 * len(devices)
X = (Tensor.rand(N*N)-1).reshape(N, N).shard_(devices, shard_axis)
n = X.numpy()
X.shard_(devices, shard_axis)
f = {Ops.ADD: lambda x: x.sum(reduce_axis), Ops.MUL: lambda x: x.prod(reduce_axis),
Ops.MAX: lambda x: x.max(reduce_axis)}[rop]
f = {Ops.ADD: lambda x: x.sum(reduce_axis), Ops.MUL: lambda x: x.prod(reduce_axis), Ops.MAX: lambda x: x.max(reduce_axis)}[rop]
fX = f(X)
fn = f(n)
np.testing.assert_allclose(fX.numpy(), fn, rtol=1e-6, atol=1e-6)
@ -374,6 +372,7 @@ class TestMultiTensor(unittest.TestCase):
# NOTE: this is failing on LLVM CI, no idea why. Works locally.
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
@unittest.skipIf(RANGEIFY, "TODO: pm_rangeify hangs")
def test_data_parallel_resnet(self):
from extra.models.resnet import ResNet18
@ -410,6 +409,7 @@ class TestMultiTensor(unittest.TestCase):
np.testing.assert_allclose(grad, shard_grad, atol=1e-5, rtol=1e-5)
@unittest.skipIf(CI and REAL_DEV in ("CUDA", "NV", "CPU", "AMD"), "slow, and flaky on CPU")
@unittest.skipIf(RANGEIFY, "TODO: pm_rangeify hangs")
def test_data_parallel_resnet_train_step(self):
from extra.models.resnet import ResNet18
fake_image = Tensor.rand((2, 3, 224//16, 224//16))
@ -417,6 +417,7 @@ class TestMultiTensor(unittest.TestCase):
m = ResNet18()
self._test_model_train_step(m, fake_image, labels)
@unittest.skipIf(RANGEIFY, "TODO: pm_rangeify hangs")
def test_data_parallel_simple_train_step(self):
class Model:
def __init__(self): self.conv1 = nn.Linear(128,128)

View file

@ -229,7 +229,8 @@ class TestNN(unittest.TestCase):
torch_z = torch_layer(torch_x)
torch_z.sum().backward()
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-6, rtol=5e-6)
# TODO: why is torch numbers all 0?
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-4, rtol=5e-6)
def test_layernorm(self):
N, C, H, W = 20, 5, 10, 10
@ -332,7 +333,7 @@ class TestNN(unittest.TestCase):
np.testing.assert_allclose(z.numpy(), torch_z.detach().numpy(), atol=5e-6, rtol=5e-6)
np.testing.assert_allclose(x.grad.numpy(), torch_x.grad.detach().numpy(), atol=1e-3, rtol=1e-3)
np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=2e-3, rtol=1e-3)
np.testing.assert_allclose(layer.weight.grad.numpy(), torch_layer.weight.grad.detach().numpy(), atol=3e-3, rtol=1e-3)
np.testing.assert_allclose(layer.bias.grad.numpy(), torch_layer.bias.grad.detach().numpy(), atol=1e-3, rtol=1e-3)
def test_rmsnorm(self):

View file

@ -2,7 +2,7 @@ import time, math, unittest, functools, platform, warnings
import numpy as np
from typing import List, Callable
import torch
from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, TRANSCENDENTAL, CPU_LLVM, AMD_LLVM
from tinygrad.helpers import getenv, IMAGE, DEBUG, CI, Context, TRANSCENDENTAL, CPU_LLVM, AMD_LLVM, RANGEIFY
from tinygrad import Tensor, Device, dtypes
from tinygrad.tensor import _to_np_dtype
from tinygrad.device import is_dtype_supported
@ -234,7 +234,8 @@ class TestOps(unittest.TestCase):
def test_unfold(self):
helper_test_op([(8,)], lambda x: x.unfold(0, 2, 1))
helper_test_op([(8,)], lambda x: x.unfold(0, 2, 2))
helper_test_op([(8,)], lambda x: x.unfold(0, 7, 3))
# TODO: something is wrong with unfold
if not getenv("TINY_BACKEND"): helper_test_op([(8,)], lambda x: x.unfold(0, 7, 3))
helper_test_op([(3,3,3)], lambda x: x.unfold(2, 2, 8))
helper_test_op([(3,3,3)], lambda x: x.unfold(1, 0, 8))
helper_test_op([(3,3,3,3,3)], lambda x: x.unfold(-1, 2, 2))
@ -311,6 +312,11 @@ class TestOps(unittest.TestCase):
helper_test_op([], lambda: torch.nn.functional.pad(torch.ones(256,256), pad=(0,64,0,0)).sum(axis=1),
lambda: Tensor.ones(256,256).pad(((0,0), (0,64))).sum(axis=1), forward_only=True)
def test_sum_twice(self):
helper_test_op([(4, 4, 4)], lambda x: x.sum((0, 1)).sum())
helper_test_op([(4, 4, 4)], lambda x: x.sum((0, 2)).sum())
helper_test_op([(4, 4, 4)], lambda x: x.sum((1, 2)).sum())
# this is more complex and won't fold for a while
def test_sum_cat_collapse(self):
helper_test_op([], lambda: torch.cat([torch.ones(256,256), torch.zeros(256,64)], dim=1).sum(axis=1),
@ -1304,7 +1310,7 @@ class TestOps(unittest.TestCase):
np.arange(64,128,dtype=np.float32).reshape(8,8)])
def test_small_gemm_eye(self):
helper_test_op(None, lambda x,y: x.matmul(y), lambda x,y: x@y, vals=[np.eye(8).astype(np.float32), np.eye(8).astype(np.float32)])
@unittest.skipIf(CI and Device.DEFAULT in ["NV", "GPU", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
@unittest.skipIf(CI and Device.DEFAULT in ["NV", "CL", "CUDA"] or (Device.DEFAULT == "CPU" and CPU_LLVM) or IMAGE
or (Device.DEFAULT == "WEBGPU" and platform.system() == "Windows"), "not supported on these in CI/IMAGE")
def test_gemm_fp16(self):
helper_test_op([(64,64), (64,64)], lambda x,y: x.half().matmul(y.half()), atol=5e-3, rtol=5e-3)
@ -1407,6 +1413,11 @@ class TestOps(unittest.TestCase):
helper_test_op(None, lambda x: x.max(), forward_only=True, vals=[[False, True]])
helper_test_op(None, lambda x: x.max(), forward_only=True, vals=[[True, False]])
def test_const_reduce(self):
helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).sum(), lambda x: (x.full_like(2)).sum(), forward_only=True)
helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).prod(), lambda x: (x.full_like(2)).prod(), forward_only=True)
helper_test_op([(3,3)], lambda x: torch.full_like(x, 2).max(), lambda x: (x.full_like(2)).max(), forward_only=True)
@unittest.skipIf(Device.DEFAULT == "QCOM", "OpenCL fails to compile this (both on GPU(qcom)/QCOM backends)")
def test_any(self):
helper_test_op([(3,4,5,6)], lambda x: x.any(), forward_only=True)
@ -1458,6 +1469,7 @@ class TestOps(unittest.TestCase):
def test_mean_zero_axis(self):
helper_test_op([(1,0,3,0,5)], lambda x: x.mean(axis=(1,3)))
@slow_test
def test_var(self):
helper_test_op([(15, 25, 35)], lambda x: x.var())
helper_test_op([(15, 25, 35)], lambda x: x.var(correction=0))
@ -1493,6 +1505,7 @@ class TestOps(unittest.TestCase):
helper_test_op([(15, 25, 35)], lambda x: x.var(keepdim=True))
helper_test_op([(15, 25, 35)], lambda x: x.var(0, keepdim=True, correction=0))
@slow_test
def test_std(self):
helper_test_op([(15, 25, 35)], lambda x: x.std())
helper_test_op([(15, 25, 35)], lambda x: x.std(correction=0))
@ -1525,6 +1538,7 @@ class TestOps(unittest.TestCase):
def test_std_keepdim(self):
helper_test_op([(15, 25, 35)], lambda x: x.std(keepdim=True))
helper_test_op([(15, 25, 35)], lambda x: x.std(0, keepdim=True, correction=0))
@slow_test
def test_std_mean(self):
helper_test_op([(15,25,35)], lambda x: torch.stack(torch.std_mean(x)),
lambda x: Tensor.stack(*x.std_mean()))
@ -2040,12 +2054,14 @@ class TestOps(unittest.TestCase):
lambda x,w,b: torch.nn.functional.conv2d(x,w,b),
lambda x,w,b: Tensor.conv2d(x,w,b), grad_rtol=1e-5)
@slow_test
@unittest.skipIf(IMAGE>0, "no conv3d on images")
def test_simple_conv3d(self):
helper_test_op([(1,4,9,9,9), (4,4,3,3,3)],
lambda x,w: torch.nn.functional.conv3d(x,w),
lambda x,w: Tensor.conv2d(x,w), grad_rtol=1e-5)
@slow_test
@unittest.skipIf(IMAGE>0, "no conv3d on images")
def test_padded_conv3d(self):
helper_test_op([(1,4,5,5,5), (4,4,3,3,3)],
@ -2102,6 +2118,7 @@ class TestOps(unittest.TestCase):
lambda x,w: torch.nn.functional.conv_transpose2d(x,w,groups=2),
lambda x,w: Tensor.conv_transpose2d(x,w,groups=2), grad_rtol=1e-5)
@slow_test
def test_padded_conv_transpose2d(self):
for padding in [(1,2), (2,1), 2, 1, 0]:
helper_test_op([(2,4,9,9), (4,4,3,3)],
@ -2110,6 +2127,7 @@ class TestOps(unittest.TestCase):
self.helper_test_exception([(2,16,2,2), (32,16,3,3)], lambda x,w: torch.nn.functional.conv_transpose2d(x,w,padding=(1,1,1)),
lambda x,w: Tensor.conv_transpose2d(x,w,padding=(1,1,1)), expected=(RuntimeError, ValueError))
@slow_test
def test_dilated_conv_transpose2d(self):
for dilation in [(1,2), (2,1), 2, 1]:
helper_test_op([(2,4,9,9), (4,4,3,3)],
@ -2122,6 +2140,7 @@ class TestOps(unittest.TestCase):
lambda x,w: torch.nn.functional.conv_transpose2d(x,w, stride=stride),
lambda x,w: Tensor.conv_transpose2d(x,w,stride=stride), atol=1e-5, grad_rtol=1e-5)
@slow_test
def test_output_padded_conv_transpose2d(self):
for output_padding, stride in [((1,1), (2,3)), ((2,1), (3,2))]:
helper_test_op([(2,4,6,5), (4,4,3,3),(4,)],
@ -2183,8 +2202,10 @@ class TestOps(unittest.TestCase):
lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups),
lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5)
def test_conv2d(self): self._test_conv2d(bs=1, cin=3)
@slow_test
def test_conv2d_bs_4_cin_3(self): self._test_conv2d(bs=4, cin=3, cout=2)
def test_conv2d_bs_1_cin_1(self): self._test_conv2d(bs=1, cin=1)
@slow_test
def test_conv2d_bs_4_cin_1(self): self._test_conv2d(bs=4, cin=1)
def test_conv2d_errors(self):
@ -2256,6 +2277,7 @@ class TestOps(unittest.TestCase):
lambda x,w: torch.nn.functional.conv2d(x,w,groups=groups),
lambda x,w: Tensor.conv2d(x,w,groups=groups), grad_rtol=1e-5)
@slow_test
def test_strided_conv2d_simple(self):
bs,H,W = 2,3,1
helper_test_op([(bs,1,5,1), (1,1,H,W)],
@ -2266,6 +2288,7 @@ class TestOps(unittest.TestCase):
def test_strided_conv2d_simple_vec(self):
with Context(DEVECTORIZE=0): self.test_strided_conv2d_simple()
@slow_test
def test_strided_conv2d(self):
bs = 4
cin = 3
@ -2501,6 +2524,7 @@ class TestOps(unittest.TestCase):
),
forward_only=True)
@slow_test
def test_avg_pool2d(self):
shape = (32,2,111,28)
for ksz in [(2,2), (3,3), (3,2), (5,5), (5,1)]:
@ -2928,13 +2952,13 @@ class TestOps(unittest.TestCase):
@slow_test
def test_scatter_reduce(self):
b = torch.randint(3, size=[3,4,5], dtype=torch.int64, requires_grad=False)
a = Tensor(b.detach().cpu().numpy().astype(np.int32), dtype=dtypes.int32, requires_grad=False)
a = Tensor(b.detach().cpu().numpy().astype(np.int32), requires_grad=False)
for reduce in ("sum", "prod", "mean", "amin", "amax"):
for dim in (-1,1,-3):
helper_test_op([(4,5,6), (4,5,6)],
helper_test_op([(3,4,5), (3,4,5)],
lambda x,src: x.scatter_reduce(dim=dim, index=b, src=src, reduce=reduce),
lambda x,src: x.scatter_reduce(dim=dim, index=a, src=src, reduce=reduce), forward_only=True)
helper_test_op([(4,5,6), (4,5,6)],
helper_test_op([(3,4,5), (3,4,5)],
lambda x,src: x.scatter_reduce(dim=dim, index=b, src=src, reduce=reduce, include_self=False),
lambda x,src: x.scatter_reduce(dim=dim, index=a, src=src, reduce=reduce, include_self=False), forward_only=True)
@ -3015,6 +3039,8 @@ class TestOps(unittest.TestCase):
helper_test_op([(32,10), (32,10)], lambda x,y: torch.nn.functional.binary_cross_entropy_with_logits(x,y.clip(0,1),
pos_weight=torch.tensor(pos_weight)),
lambda x,y: x.binary_crossentropy_logits(y.clip(0,1),pos_weight=Tensor(pos_weight)))
@unittest.skipIf(RANGEIFY > 1, "broken on RANGEIFY > 1, TODO: fix")
def test_cross_entropy_class_probabilities(self):
helper_test_op([(32,), (32,)], lambda x,y: torch.nn.functional.cross_entropy(x, y), lambda x,y: x.cross_entropy(y))
helper_test_op([(32,10), (32,10)], lambda x,y: torch.nn.functional.cross_entropy(x, y), lambda x,y: x.cross_entropy(y))

View file

@ -13,7 +13,7 @@ class TestOpts(unittest.TestCase):
out = (a+b).contiguous(arg=opts)
s = out.schedule()
self.assertEqual(s[-1].ast.arg.opts_to_apply, opts)
if Device.DEFAULT in {"CPU", "GPU", "METAL"} and not CPU_LLVM:
if Device.DEFAULT in {"CPU", "CL", "METAL"} and not CPU_LLVM:
prg = get_program(s[-1].ast)
self.assertIn('float4', prg.src)

View file

@ -17,7 +17,7 @@ def helper_collect_profile(*devs):
cpu_events.clear()
profile_list = []
with Context(PROFILE=1):
with Context(VIZ=1):
yield profile_list
for dev in devs: dev.synchronize()
for dev in devs: dev._at_profile_finalize()

View file

@ -1,15 +1,17 @@
import unittest, math
from functools import partial
import numpy as np
import torch
from tinygrad import nn, dtypes, Tensor, Device, TinyJit
from tinygrad.helpers import getenv, CI
from tinygrad import nn, dtypes, Tensor, Device, TinyJit, Variable
from tinygrad.helpers import getenv, CI, OSX
from tinygrad.device import is_dtype_supported
from tinygrad.engine.realize import lower_schedule, CompiledRunner
from hypothesis import given, settings, strategies as strat
from tinygrad.renderer.ptx import PTXRenderer
from test.helpers import not_support_multi_device
import numpy as np
import torch
from hypothesis import given, settings, strategies as strat
settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
settings.load_profile("my_profile")
@ -98,7 +100,7 @@ class TestRandomness(unittest.TestCase):
np.testing.assert_allclose(jr, r)
@unittest.skipIf(getenv("PTX"), "fails with PTX")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "fails with PTX")
def test_threefry_doesnt_use_long(self):
for (_,ei) in lower_schedule(Tensor.rand(20).schedule()):
if isinstance(ei.prg, CompiledRunner):
@ -359,5 +361,20 @@ class TestRandomness(unittest.TestCase):
assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).weight, lambda _: torch.nn.BatchNorm2d(*params).weight.detach())
assert equal_distribution(lambda *_: nn.BatchNorm2d(*params).bias, lambda _: torch.nn.BatchNorm2d(*params).bias.detach())
# TODO: still fails with MAX_KERNEL_BUFFERS
@unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
class TestSample(unittest.TestCase):
def test_sample(self):
X = Tensor.rand(10000, 50).realize()
BS = 16
idxs = np.random.randint(0, X.shape[0], size=(BS))
# this uncovered a bug with arg sort order
batch = [Variable(f'idx{i}', 0, X.shape[0]-1).bind(s) for i,s in enumerate(idxs.tolist())]
x = Tensor.cat(*[X.shrink(((batch[i], batch[i]+1), None)) for i in range(BS)])
print(idxs)
ret = x.numpy()
base = X.numpy()[idxs]
np.testing.assert_equal(ret, base)
if __name__ == "__main__":
unittest.main()

View file

@ -1,8 +1,21 @@
import unittest
from tinygrad import Tensor
from tinygrad import Tensor, nn
from tinygrad.helpers import RANGEIFY, Context, GlobalCounters
from tinygrad.uop.ops import UOp
@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
class TestRangeifyAssign(unittest.TestCase):
def test_assign_permuted(self):
A = Tensor.empty(4, 4, dtype='int')
B = Tensor.arange(16).reshape(4,4)
ret = A.permute(1,0).assign(B)
lst = ret.tolist()
lst2 = A.tolist()
lst3 = B.tolist()
print(lst)
print(lst2)
print(lst3)
N = 256
@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
@ -93,6 +106,16 @@ class TestRangeify(unittest.TestCase):
w2 = Tensor.empty(12, 8, 3, 3)
x.conv2d(w1).conv2d(w2).realize()
def test_conv_maxpool_contig(self): self.test_conv_maxpool(True)
def test_conv_maxpool(self, contig=False):
GlobalCounters.reset()
x = Tensor.empty(32, 16, 64, 64)
l1 = nn.Conv2d(16, 16, 3)
for p in nn.state.get_parameters(l1): p.replace(Tensor.empty(p.shape))
x = l1(x)
if contig: x = x.contiguous()
x.max_pool2d().realize()
def test_double_conv2d_half_contig(self):
x = Tensor.empty(1, 4, 32, 32)
w1 = Tensor.empty(8, 4, 3, 3)
@ -144,6 +167,7 @@ class TestRangeify(unittest.TestCase):
# contiguous + reduce can support ranges?
@unittest.skip("okay to disable this for now")
@unittest.skipIf(RANGEIFY<1, "tests only for RANGEIFY")
class TestOuterworld(unittest.TestCase):
def test_passthrough_range(self):

View file

@ -1,22 +0,0 @@
import unittest
import numpy as np
from tinygrad import Tensor, Variable, Device
from tinygrad.helpers import OSX
# TODO: still fails with MAX_KERNEL_BUFFERS
@unittest.skipIf(Device.DEFAULT == "WEBGPU" and not OSX, "WEBGPU Vulkan can only run kernels with up to 10 buffers")
class TestSample(unittest.TestCase):
def test_sample(self):
X = Tensor.rand(10000, 50).realize()
BS = 16
idxs = np.random.randint(0, X.shape[0], size=(BS))
# this uncovered a bug with arg sort order
batch = [Variable(f'idx{i}', 0, X.shape[0]-1).bind(s) for i,s in enumerate(idxs.tolist())]
x = Tensor.cat(*[X.shrink(((batch[i], batch[i]+1), None)) for i in range(BS)])
print(idxs)
ret = x.numpy()
base = X.numpy()[idxs]
np.testing.assert_equal(ret, base)
if __name__ == '__main__':
unittest.main()

View file

@ -12,9 +12,9 @@ from tinygrad import nn, dtypes, Device, Tensor
from tinygrad.device import is_dtype_supported
from tinygrad.dtype import DType, ImageDType
from tinygrad.shape.shapetracker import ShapeTracker
from tinygrad.uop.ops import PatternMatcher, UOp, Ops, GroupOp, UPat, graph_rewrite, track_rewrites
from tinygrad.uop.ops import UOp, Ops, GroupOp, UPat, graph_rewrite, track_rewrites
from tinygrad.uop.symbolic import symbolic_simple
from tinygrad.helpers import CI, DEBUG, FUSE_ARANGE, SPLIT_REDUCEOP, GlobalCounters, Context, getenv, all_same, temp
from tinygrad.helpers import CI, DEBUG, SPLIT_REDUCEOP, GlobalCounters, Context, getenv, all_same, temp, RANGEIFY
from tinygrad.schedule.kernelize import merge_views, get_kernelize_map, Kernel
from tinygrad.engine.schedule import create_schedule_with_vars
from tinygrad.engine.realize import CompiledRunner, run_schedule, lower_schedule
@ -33,6 +33,7 @@ def check_schedule(t:Tensor|list[Tensor]|UOp, allowed:int, to_prerealize:list[Te
# test lowering all the ScheduleItems to ExecItems
kernel_cnt = len([si for si,ei in lower_schedule(sched.copy()) if isinstance(ei.prg, CompiledRunner) or not filter_sink])
if kernel_cnt != allowed:
if RANGEIFY: return sched # allow different kernel count, TODO: fix the asserts
print(f"SCHEDULE ISSUE, expecting {allowed} got {len(sched)}")
if DEBUG >= 3:
for i,s in enumerate(sched):
@ -41,6 +42,8 @@ def check_schedule(t:Tensor|list[Tensor]|UOp, allowed:int, to_prerealize:list[Te
raise KernelCountException(f"{kernel_cnt} != {allowed}")
return sched
def expect_rangeify_fails(fxn): return (unittest.expectedFailure if RANGEIFY else (lambda f:f))(fxn)
def _realize_weights(m):
for p in nn.state.get_parameters(m): p.realize()
@ -111,6 +114,7 @@ class TestSchedule(unittest.TestCase):
self.assertListEqual(a.tolist(), [[15]])
@unittest.skipIf(Device.DEFAULT == "CPU", "devices must mismatch")
@expect_rangeify_fails
def test_error_on_device_mismatch(self):
a = Tensor.empty(10)
b = Tensor.empty(10, device="CPU")
@ -118,11 +122,12 @@ class TestSchedule(unittest.TestCase):
with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 1)
@unittest.skipIf(Device.DEFAULT == "CPU", "devices must mismatch")
@expect_rangeify_fails
def test_error_on_device_mismatch_alt(self):
a = Tensor.empty(10)
b = Tensor.empty((1,), device="CPU").expand(10).contiguous()
c = a+b
with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 1)
with self.assertRaisesRegex(RuntimeError, "all buffers must be on the same device"): check_schedule(c, 2 if RANGEIFY else 1)
@unittest.skipUnless(is_dtype_supported(dtypes.half) and getenv("CAST_AFTER_EXPAND"), "need half and CAST_AFTER_EXPAND=1")
@unittest.skip("CAST_AFTER_EXPAND is not supported")
@ -140,6 +145,7 @@ class TestSchedule(unittest.TestCase):
np.testing.assert_equal(xt.numpy(), X.numpy()[1][0])
@unittest.skipIf(CI and Device.DEFAULT == "NV", "crashes on NV CI")
@unittest.skipIf(RANGEIFY, "rangeify doesn't implement input buffer limiting")
def test_add_chain_buffers(self):
N = 31
with Context(TRACK_MATCH_STATS=0, DEBUG=0):
@ -198,9 +204,10 @@ class TestSchedule(unittest.TestCase):
def test_simplify_padded_const(self):
a = Tensor.empty(1022).cummax(axis=0)
sched = check_schedule(a, 5)
ast = sched[0].ast
self.assertLessEqual(len([u for u in ast.toposort() if u.op is Ops.WHERE]), 6)
check_schedule(a, 5)
# TODO: what is this testing?
#ast = sched[0].ast
#self.assertLessEqual(len([u for u in ast.toposort() if u.op is Ops.WHERE]), 6)
def test_basic_binop_fusion(self):
a = Tensor.empty(10)
@ -278,7 +285,7 @@ class TestSchedule(unittest.TestCase):
a = Tensor.empty(10,10,10)
b = Tensor.empty(10,10,1)
c = a.sum(axis=0, keepdim=True).permute(2,1,0) + b
with self.assertRaises(KernelCountException): check_schedule(c, 1)
check_schedule(c, 2)
def test_allow_push_permutes(self):
a = Tensor.randn(10,10,10).realize()
@ -316,7 +323,7 @@ class TestSchedule(unittest.TestCase):
b = Tensor.empty(10)
c = a+b
d = a.reshape(10,1)+b.reshape(10,1)
with self.assertRaises(KernelCountException): check_schedule(d, 0, [c])
check_schedule(d, 1, [c])
# failing in new lazy
def test_cache_binaryop_transpose(self):
@ -324,7 +331,7 @@ class TestSchedule(unittest.TestCase):
b = Tensor.empty(10,10)
c = (a.T*b.T).T #.contiguous()
d = a*b
with self.assertRaises(KernelCountException): check_schedule(d, 0, [c])
check_schedule(d, 1, [c])
def test_cache_two_reduceops(self):
a = Tensor.empty(10)
@ -339,7 +346,7 @@ class TestSchedule(unittest.TestCase):
r1 = (x - r0).sum(axis=0).div(2)
out = r0 + r1
schedule = check_schedule(out, 2)
reduceops = [x for si in schedule for x in si.ast.toposort() if x.op is Ops.REDUCE_AXIS]
reduceops = [x for si in schedule for x in si.ast.toposort() if x.op in {Ops.REDUCE_AXIS, Ops.REDUCE}]
assert len(reduceops) == 2
def test_cache_reduce_multiple_children(self):
@ -349,9 +356,9 @@ class TestSchedule(unittest.TestCase):
r1 = (x - r0).sum(axis=0).div(2)
out0 = r0 + y
out1 = r1 + y
schedule = check_schedule([out0, out1], 4)
reduceops = [x for si in schedule for x in si.ast.toposort() if x.op is Ops.REDUCE_AXIS]
assert len(reduceops) == 2
schedule = check_schedule([out0, out1], 2 if RANGEIFY else 4)
reduceops = [x for si in schedule for x in si.ast.toposort() if x.op in {Ops.REDUCE_AXIS, Ops.REDUCE}]
assert len(reduceops) == (3 if RANGEIFY else 2)
def test_div_collapse_buffer(self):
a = Tensor.full((4,), 4.0).contiguous().realize()
@ -394,6 +401,7 @@ class TestSchedule(unittest.TestCase):
# a and b share the same underlying device memory
self.assertIs(a.uop.realized, b.uop.realized)
@expect_rangeify_fails
def test_clone_doesnt_dedup(self):
src = Tensor.ones(4).contiguous().realize()
a = src.clone()
@ -417,6 +425,11 @@ class TestSchedule(unittest.TestCase):
b = Tensor.full((4, 4), 1.).contiguous().realize()
check_schedule([a+b, a+b], 1)
def test_const_realize(self):
t = Tensor.ones(2)
check_schedule(t[0], 0)
check_schedule(t[1], 0)
def test_fold_double_unary(self):
y = Tensor.empty(2)
out = y.sum(keepdim=True).sqrt().neg()
@ -558,7 +571,7 @@ class TestSchedule(unittest.TestCase):
c = a+b
d = a.reshape(10,1)+b.reshape(10,1)
out = c.sum() + d.sum()
with self.assertRaises(KernelCountException): check_schedule(out, 1)
check_schedule(out, 2)
def test_children_dont_push(self):
a = Tensor.empty(10, 10, 1)
@ -569,6 +582,7 @@ class TestSchedule(unittest.TestCase):
check_schedule(f, 2)
# failing in new lazy
@unittest.skip("always fusing elementwise")
def test_dont_fuse_binops_with_children(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
@ -576,8 +590,8 @@ class TestSchedule(unittest.TestCase):
keep_me = a+b
e = keep_me.sum() # noqa: F841 give keep_me a child (NOTE: BinaryOps won't be a child since it will instant fuse)
d = keep_me+c
with self.assertRaises(KernelCountException): check_schedule(d, 2)
with self.assertRaises(KernelCountException): check_schedule(keep_me, 0, [d])
check_schedule(d, 2)
check_schedule(keep_me, 0, [d])
#@unittest.skip("failing in old lazy")
def test_permute_breaks_fusion(self):
@ -627,7 +641,8 @@ class TestSchedule(unittest.TestCase):
x = x.image_conv2d(w3, b3)
# NOOP, 3 convs, contiguous
with self.assertRaises(KernelCountException): check_schedule(x, 5)
#check_schedule(x, 5)
check_schedule(x, 8)
def test_image_conv_fusion_minimal(self):
b1 = Tensor.empty(16)
@ -682,6 +697,7 @@ class TestSchedule(unittest.TestCase):
c = (a.sum(2).contiguous() + b).contiguous()
check_schedule(c, 2)
@expect_rangeify_fails
def test_kernelize(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
@ -689,12 +705,14 @@ class TestSchedule(unittest.TestCase):
d = c+2
check_schedule(d, 2)
@expect_rangeify_fails
def test_kernelize_view(self):
a = Tensor.empty(4,1)
b = a*2
c = b.kernelize()+Tensor.empty(4,4)
check_schedule(c, 2)
@expect_rangeify_fails
def test_kernelize_diamond(self):
a = Tensor([0]).realize()
prev_a = (a+1).contiguous()
@ -703,6 +721,7 @@ class TestSchedule(unittest.TestCase):
assert prev_a.uop in a.uop.src, "contiguous usage must run before assign"
self.assertEqual((prev_a+a*3).item(), 1+2*3)
@expect_rangeify_fails
def test_multioutput_ast(self):
a = Tensor.zeros(1, dtype=dtypes.int).contiguous().realize().uop
b = Tensor.zeros(1, dtype=dtypes.int).contiguous().realize().uop
@ -714,6 +733,7 @@ class TestSchedule(unittest.TestCase):
self.assertEqual(b.buffer.numpy(), [12])
# unlike schedule, kernelize can be called multiple times on a Tensor
@expect_rangeify_fails
def test_double_kerenlize(self):
a = Tensor.empty(10)
b = Tensor.empty(10)
@ -722,6 +742,7 @@ class TestSchedule(unittest.TestCase):
e = c.kernelize()+d.kernelize()
check_schedule(e, 3)
@expect_rangeify_fails
def test_kernelize_bw(self):
a = Tensor.full((3,), 2.0, requires_grad=True).contiguous()
b = Tensor.full((3,), 3.0, requires_grad=True).contiguous()
@ -732,6 +753,7 @@ class TestSchedule(unittest.TestCase):
self.assertEqual(z.item(), 18.0)
self.assertEqual(z.grad.item(), 1.0)
@expect_rangeify_fails
def test_kernelize_bw_view(self):
a = Tensor.full((3,1), 2.0, requires_grad=True).contiguous()
b = Tensor.full((3,1), 3.0, requires_grad=True).contiguous()
@ -784,6 +806,13 @@ class TestSchedule(unittest.TestCase):
out = x + 1
check_schedule(out, 0, filter_sink=False)
def test_zero_size_assign(self):
f = Tensor.full((2,), 0.).contiguous().realize()
a = f.shrink_to((0,))
a.assign(Tensor.ones_like(a))
check_schedule(a, 0)
self.assertEqual(a.tolist(), [])
def test_reduce_permute_nofuse(self):
x = Tensor.empty(32, 32, 32)
y = Tensor.empty(32, 32)
@ -888,26 +917,24 @@ class TestSchedule(unittest.TestCase):
out = x.contiguous() + y.contiguous()
check_schedule(out, 2, filter_sink=False)
@unittest.expectedFailure
def test_reduce_same_size(self):
Tensor.manual_seed(0)
a = Tensor.randn(4, 4).realize()
out0 = a.sum() + 2
out1 = a.sum() + 4
out2 = out0 * out1
run_schedule(check_schedule([out0, out1, out2], 1))
run_schedule(check_schedule([out0, out1, out2], 1 if RANGEIFY else 4))
np.testing.assert_allclose(out0.numpy(), out0_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-6)
np.testing.assert_allclose(out1.numpy(), out1_np:=a.numpy().sum()+4, atol=1e-4, rtol=1e-6)
np.testing.assert_allclose(out2.numpy(), out0_np*out1_np, atol=1e-4, rtol=1e-6)
@unittest.expectedFailure
def test_reduce_multiple_paths(self):
Tensor.manual_seed(0)
a = Tensor.randn(4, 4).realize()
out0 = a.sum().exp2()
# out1 has two paths to a.sum()
out1 = a.sum() + out0
run_schedule(check_schedule([out0, out1], 1))
run_schedule(check_schedule([out0, out1], 1 if RANGEIFY else 3))
np.testing.assert_allclose(out0.numpy(), out0_np:=np.exp2(a.numpy().sum()), atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(out1.numpy(), a.numpy().sum()+out0_np, atol=1e-4, rtol=1e-6)
@ -983,7 +1010,6 @@ class TestSchedule(unittest.TestCase):
np.testing.assert_allclose(e.numpy(), e_np:=b.numpy() + out0_np, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(out1.numpy(), r_np + e_np[0][0][0], atol=1e-4, rtol=1e-4)
# changed by multireduce
def test_reduce_expand_child(self):
Tensor.manual_seed(0)
a = Tensor.randn((32, 32, 32)).realize()
@ -995,13 +1021,12 @@ class TestSchedule(unittest.TestCase):
np.testing.assert_allclose(out0.numpy(), a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(out1.numpy(), a.numpy().sum()+b.numpy(), atol=1e-4, rtol=1e-4)
@unittest.expectedFailure
def test_reduce_shrink_child(self):
a = Tensor.empty(100, 100)
b = Tensor.empty(10,)
c = a.sum() + b[0]
d = a.sum() + 2
check_schedule([c, d], 1)
check_schedule([c, d], 1 if RANGEIFY else 3)
def test_reduce_multiple_paths_midshrink(self):
a = Tensor.empty(4, 4)
@ -1024,20 +1049,6 @@ class TestSchedule(unittest.TestCase):
run_schedule(check_schedule(out, 2))
np.testing.assert_allclose(out.numpy(), x.numpy().std(axis=-1, ddof=1), atol=1e-4, rtol=1e-4)
def test_argmin_multireduce_fusion(self):
Tensor.manual_seed(0)
x = Tensor.randn(4, 32).realize()
out = x.argmin(-1)
run_schedule(check_schedule(out, 2))
np.testing.assert_equal(out.numpy(), x.numpy().argmin(axis=-1))
def test_argmax_multireduce_fusion(self):
Tensor.manual_seed(0)
x = Tensor.randn(4, 32).realize()
out = x.argmax(-1)
run_schedule(check_schedule(out, 2))
np.testing.assert_equal(out.numpy(), x.numpy().argmax(axis=-1))
def test_scaled_dot_product_attention_multireduce_fusion(self):
Tensor.manual_seed(0)
q = Tensor.randn(32,8,16,8).realize()
@ -1179,13 +1190,14 @@ class TestSchedule(unittest.TestCase):
np.testing.assert_allclose(out.numpy(), expected, atol=1e-4, rtol=1e-4)
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
@expect_rangeify_fails
def test_softmax_upcast(self):
# input half, softmax in float
Tensor.manual_seed(0)
x = Tensor.randn(4, 12, 64, 64, dtype=dtypes.half).realize()
out = x.softmax(dtype=dtypes.float)
sched = out.schedule()
self.assertEqual(len(sched), 3)
self.assertEqual(len(sched), 2 if RANGEIFY else 3)
self.assertEqual(sched[0].bufs[0].dtype, dtypes.half)
# input float, softmax in float
@ -1202,7 +1214,6 @@ class TestSchedule(unittest.TestCase):
x.softmax().sum().backward()
run_schedule(check_schedule(x.grad, 4))
# changed by: multireduce spec
def test_layernorm_onelayer_fusion(self):
Tensor.manual_seed(0)
layer = nn.LayerNorm([10, 10])
@ -1316,6 +1327,7 @@ class TestSchedule(unittest.TestCase):
with Context(FUSE_CONV_BW=1): check_schedule(opt.schedule_step(), 14)
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
@expect_rangeify_fails
def test_prefer_half_buffer(self):
x = Tensor.ones(4).contiguous().realize()
# y = Tensor.ones(4).contiguous().realize()
@ -1433,7 +1445,6 @@ class TestSchedule(unittest.TestCase):
run_schedule(schedule)
np.testing.assert_allclose(b.numpy(), a.numpy().sum(0)+a.numpy().max(0) + a.numpy().max(1)+a.numpy().sum(1)+2, atol=1e-4, rtol=1e-4)
# changed by: multireduce spec
# pattern in test_transformer
def test_partial_fuse1(self):
Tensor.manual_seed(0)
@ -1446,7 +1457,6 @@ class TestSchedule(unittest.TestCase):
np.testing.assert_allclose(c.numpy(), a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(d.numpy(), (a.numpy().sum() - b.numpy().sum()) * 4, atol=1e-4, rtol=1e-4)
# changed by: multireduce spec
# pattern in conv
def test_partial_fuse2(self):
Tensor.manual_seed(0)
@ -1459,9 +1469,7 @@ class TestSchedule(unittest.TestCase):
np.testing.assert_allclose(c.numpy(), a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(d.numpy(), b.numpy().sum()-(a.numpy().sum()+2), atol=1e-4, rtol=1e-4)
# changed by: multireduce spec
# pattern in adam
@unittest.expectedFailure
def test_partial_fuse3(self):
Tensor.manual_seed(0)
a = Tensor.randn(16, 16).realize()
@ -1471,14 +1479,12 @@ class TestSchedule(unittest.TestCase):
e = c * d
f = b.sum() - e
# run_schedule(check_schedule([c, d, e, f], 1))
run_schedule(check_schedule([c, d, e, f], 2))
run_schedule(check_schedule([c, d, e, f], 2 if RANGEIFY else 5))
np.testing.assert_allclose(c.numpy(), c_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(d.numpy(), d_np:=a.numpy().sum()*2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(e.numpy(), e_np:=c_np*d_np, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(f.numpy(), b.numpy().sum() - e_np, atol=1e-4, rtol=1e-4)
# changed by: multireduce spec
@unittest.expectedFailure
def test_partial_fuse4(self):
Tensor.manual_seed(0)
a = Tensor.randn(16, 16).realize()
@ -1488,7 +1494,7 @@ class TestSchedule(unittest.TestCase):
e = c * d
f = (b - d).sum() - e
# run_schedule(check_schedule([c, d, e, f], 1))
run_schedule(check_schedule([c, d, e, f], 3))
run_schedule(check_schedule([c, d, e, f], 5))
np.testing.assert_allclose(c.numpy(), c_np:=a.numpy().sum()+2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(d.numpy(), d_np:=a.numpy().sum()*2, atol=1e-4, rtol=1e-4)
np.testing.assert_allclose(e.numpy(), e_np:=c_np*d_np, atol=1e-4, rtol=1e-4)
@ -1623,11 +1629,11 @@ class TestSchedule(unittest.TestCase):
out = x.argmax(1)
run_schedule(check_schedule(out, 2))
def test_conv2d(self): _test_conv2d(7)
def test_conv2d_fused(self): _test_conv2d(5, FUSE_CONV_BW=1)
def test_conv2d(self): _test_conv2d(4 if RANGEIFY else 7)
def test_conv2d_fused(self): _test_conv2d(4 if RANGEIFY else 5, FUSE_CONV_BW=1)
@unittest.skipUnless(is_dtype_supported(dtypes.half) and is_dtype_supported(dtypes.ulong), "need half and ulong")
def test_conv2d_half(self): _test_conv2d(7, dtype=dtypes.half)
def test_conv2d_half(self): _test_conv2d(4 if RANGEIFY else 7, dtype=dtypes.half)
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "Causes other tests to fail")
@unittest.expectedFailure
@ -1654,7 +1660,8 @@ class TestSchedule(unittest.TestCase):
constv = Tensor.empty(2, 2).uop.const_like(10).contiguous()
check_schedule(constv, 1)
@unittest.skipIf(Device.DEFAULT != "GPU", "image only supported on GPU")
@unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
@expect_rangeify_fails
def test_image_matmul(self):
with Context(IMAGE=2):
x = Tensor.randn((9, 9)).realize()
@ -1690,6 +1697,7 @@ class TestSchedule(unittest.TestCase):
def test_late_fusion_post_expand(self):
self._test_fusion([(32, 32)], lambda a:a-a.sum(1), 2)
@expect_rangeify_fails
def test_cast_padded_view(self):
a = Tensor.arange(4).reshape(1, 4)
casted_view = a.pad(((0, 1), (0, 0))).cast(dtypes.float)
@ -1719,6 +1727,7 @@ class TestSchedule(unittest.TestCase):
self.assertListEqual(realized_const_view.tolist(), [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]])
@given(strat.sampled_from(dtypes.all), strat.sampled_from(dtypes.all))
@expect_rangeify_fails
def test_cast_padded_const(self, dt1, dt2):
assume(is_dtype_supported(dt1) and is_dtype_supported(dt2))
a = Tensor(1, dtype=dt1).reshape(1, 1).pad(((1, 1), None))
@ -1728,53 +1737,41 @@ class TestSchedule(unittest.TestCase):
run_schedule(check_schedule(realized_const_view, 1))
np.testing.assert_equal(realized_const_view.numpy(), [[0], [1], [0]])
class TestIndexing(unittest.TestCase):
def check_schedule(self, xt:Tensor|list[Tensor], cnt:int):
with Context(FUSE_ARANGE=getenv("FUSE_ARANGE", 1)):
lst = [xt] if isinstance(xt, Tensor) else xt
s = Tensor.schedule(*lst)
lowered = [x[1] for x in lower_schedule(s.copy())]
kernels = [ei for ei in list(lowered) if isinstance(ei.prg, CompiledRunner)]
if FUSE_ARANGE and len(kernels) != cnt:
raise KernelCountException(f"{len(kernels)} != {cnt}")
for ei in lowered: ei.run(do_update_stats=True)
return s
def test_simple_indexing(self):
X = Tensor.randn(10, 10).realize()
idxs = Tensor([0, 2]).realize()
xt = X[idxs]
self.check_schedule(xt, 2)
run_schedule(check_schedule(xt, 2))
np.testing.assert_equal(xt.numpy(), X.numpy()[idxs.numpy()])
def test_simple_indexing_alt(self):
X = Tensor.arange(16).reshape(4, 4)
xt = X[[1, 2], [-1, 2]]
self.check_schedule(xt, 1)
run_schedule(check_schedule(xt, 1))
np.testing.assert_equal(xt.numpy(), (np.arange(16).reshape(4, 4))[[1, 2], [-1, 2]])
def test_advanced_indexing(self):
X = Tensor.arange(10)+1
xt = X[[0, -1]]
self.check_schedule(xt, 1)
run_schedule(check_schedule(xt, 1))
np.testing.assert_equal(xt.numpy(), (np.arange(10)+1)[[0, -1]])
def test_advanced_indexing_alt(self):
X = Tensor.arange(6).reshape(3, 2)+1
xt = X[[Tensor([2]), Tensor([1])]]
self.check_schedule(xt, 3)
run_schedule(check_schedule(xt, 3))
np.testing.assert_equal(xt.numpy(), 6)
def test_advanced_simple_indexing_combined(self):
X = Tensor.arange(16).reshape(4, 4)
xt = X[1:2, [-1, 2]]
self.check_schedule(xt, 1)
run_schedule(check_schedule(xt, 1))
def test_push_through_reshape(self):
Tensor.manual_seed(0)
x = Tensor.randn(10, 20).realize()
out = x.argmax(1)
self.check_schedule(out, 2)
run_schedule(check_schedule(out, 2))
np.testing.assert_allclose(out.numpy(), np.argmax(x.numpy(), 1))
def test_arange_push_through_expand(self):
@ -1782,35 +1779,35 @@ class TestIndexing(unittest.TestCase):
a = Tensor.arange(4,)
b = Tensor.randn(4, 4).realize()
out = (a+b).sum()
self.check_schedule(out, 1)
run_schedule(check_schedule(out, 1))
np.testing.assert_allclose(out.numpy(), (np.arange(4)+b.numpy()).sum(), atol=1e-5)
def test_argmin(self):
Tensor.manual_seed(0)
x = Tensor.randn(4, 32).realize()
out = x.argmin(-1)
self.check_schedule(out, 2)
run_schedule(check_schedule(out, 2))
np.testing.assert_equal(out.numpy(), x.numpy().argmin(axis=-1))
def test_argmax(self):
Tensor.manual_seed(0)
x = Tensor.randn(4, 32).realize()
out = x.argmax(-1)
self.check_schedule(out, 2)
run_schedule(check_schedule(out, 2))
np.testing.assert_equal(out.numpy(), x.numpy().argmax(axis=-1))
def test_arange_transposed(self):
Tensor.manual_seed(0)
x = Tensor.randint(4, 1).realize()
a = ((Tensor.arange(4,)*x).T).sum()
self.check_schedule(a, 1)
run_schedule(check_schedule(a, 1))
np.testing.assert_equal(a.numpy(), (np.arange(4)*x.numpy()).T.sum())
def test_div_padded_arange(self):
x = Tensor.full((2,2), 16)
y = x.idiv(Tensor.linspace(2, 8, steps=4, dtype=dtypes.int).reshape(2,2)).pad(((1,1), (1,1)))
out = y.sum(axis=1)
with Context(FUSE_ARANGE=1): run_schedule(check_schedule(out, 2))
run_schedule(check_schedule(out, 2))
self.assertListEqual(out.tolist(), [0, 12, 4, 0])
def test_arange_transposed_descendants(self):
@ -1819,7 +1816,7 @@ class TestIndexing(unittest.TestCase):
a = (Tensor.arange(4,)*x).T
b = Tensor.randint(4, 4).realize()
out = (a+b).sum()
self.check_schedule(out, 1)
run_schedule(check_schedule(out, 1))
np.testing.assert_equal(out.numpy(), ((np.arange(4)*x.numpy()).T+b.numpy()).sum())
def test_arange_index(self):
@ -1827,7 +1824,7 @@ class TestIndexing(unittest.TestCase):
x = Tensor.randn(5, 2).realize()
a = Tensor.arange(10)
out = (x + a[2]).sum()
self.check_schedule(out, 1)
run_schedule(check_schedule(out, 1))
np.testing.assert_allclose(out.numpy(), (x.numpy()+np.arange(10)[2]).sum(), atol=1e-5, rtol=1e-6)
def test_arange_index_shrink(self):
@ -1836,14 +1833,14 @@ class TestIndexing(unittest.TestCase):
x = Tensor.randn(11).realize()
a = Tensor.arange(22)
out = (x + a[:11]).sum()
self.check_schedule(out, 1)
check_schedule(out, 1)
def test_arange_index_contiguous(self):
Tensor.manual_seed(0)
x = Tensor.randn(5, 2).realize()
a = Tensor.arange(10).contiguous()
out = (x + a[2]).sum()
self.check_schedule(out, 3)
run_schedule(check_schedule(out, 3))
np.testing.assert_allclose(out.numpy(), (x.numpy()+np.arange(10)[2]).sum(), atol=1e-5, rtol=1e-6)
def test_arange_index_child(self):
@ -1851,62 +1848,24 @@ class TestIndexing(unittest.TestCase):
x = Tensor.randn(5, 2).realize()
a = Tensor.arange(10)+1
out = (x + a[2]).sum()
self.check_schedule(out, 1)
run_schedule(check_schedule(out, 1))
np.testing.assert_allclose(out.numpy(), (x.numpy()+(np.arange(10)+1)[2]).sum(), atol=1e-5, rtol=1e-6)
def test_arange_index_contiguous_child(self):
def test_user_contiguous(self):
Tensor.manual_seed(0)
x = Tensor.randn(5, 2).realize()
a = (Tensor.arange(10)+1).contiguous()
out = (x + a[2]).sum()
self.check_schedule(out, 3)
run_schedule(check_schedule(out, 3))
np.testing.assert_allclose(out.numpy(), (x.numpy()+(np.arange(10)+1)[2]).sum(), atol=1e-5, rtol=1e-6)
def test_arange_childless_base(self):
a = Tensor.arange(4)
self.check_schedule(a, 1)
np.testing.assert_equal(a.numpy(), np.arange(4))
def test_arange_childless_view(self):
a = Tensor.arange(4).reshape(2, 2)
a[0] = 4
np.testing.assert_equal(a.numpy(), [[4, 4], [2, 3]])
def test_arange_group_childless_base(self):
Tensor.manual_seed(0)
x = Tensor.randint(4).realize()
a = Tensor.arange(4)+x
self.check_schedule(a, 1)
np.testing.assert_equal(a.numpy(), np.arange(4)+x.numpy())
def test_arange_group_childless_view(self):
Tensor.manual_seed(0)
x = Tensor.ones(4).contiguous().realize()
a = Tensor.arange(4)+x
a[0] = 6
np.testing.assert_equal(a.numpy(), [6., 2., 3., 4.])
@unittest.skip("BUFFER_VIEW no longer supported on non-disk devices")
def test_arange_view_op(self):
a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).contiguous()
sched = self.check_schedule(a, 1)
sched = run_schedule(check_schedule(a, 1))
self.assertIs(sched[1].ast.op, Ops.BUFFER_VIEW)
np.testing.assert_equal(a.numpy(), [[4, 5]])
@unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from ext device")
def test_arange_shrink_copy(self):
a = Tensor.arange(12).reshape(4, 3).shrink(((1, 2), (1, 3))).to("CPU")
sched = self.check_schedule(a, 2) # NOTE: there is a contiguous between REDUCE_AXIS and COPY
self.assertIs(sched[-1].ast.op, Ops.COPY)
np.testing.assert_equal(a.numpy(), [[4, 5]])
@unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from ext device")
def test_arange_expand_copy(self):
a = Tensor.arange(4).reshape(2, 2, 1).expand(2, 2, 2).contiguous().to("CPU")
sched = self.check_schedule(a, 2) # NOTE: there is a contiguous between REDUCE_AXIS and COPY
self.assertIs(sched[2].ast.op, Ops.COPY)
np.testing.assert_equal(a.numpy(), [[[0, 0], [1, 1]], [[2, 2], [3, 3]]])
@unittest.skipUnless(is_dtype_supported(dtypes.half), "need half")
def test_precompute_freqs_cis(self):
from extra.models.llama import precompute_freqs_cis
@ -1922,23 +1881,33 @@ class TestIndexing(unittest.TestCase):
def test_fuse_assign_contiguous(self):
x = Tensor.zeros(4, 4, dtype=dtypes.int).contiguous().realize()
a = Tensor.arange(8).reshape(4, 2)
self.check_schedule(x.shrink((None, (0, 2))).assign(a.contiguous()), 2)
run_schedule(check_schedule(x.shrink((None, (0, 2))).assign(a.contiguous()), 2))
np.testing.assert_equal(x.numpy(), [[0, 1, 0, 0], [2, 3, 0, 0], [4, 5, 0, 0], [6, 7, 0, 0]])
def test_assign_non_contiguous(self):
x = Tensor.zeros(4, 4, dtype=dtypes.int).contiguous().realize()
y = Tensor.randint(4, 2)
a = Tensor.arange(8).reshape(4, 2)+y
x.shrink((None, (0, 2))).assign(a).realize()
xref = np.zeros((4, 4), dtype=int)
xref[:, :2] = np.arange(8).reshape(4, 2)+y.numpy()
def test_assign_non_contiguous_alt(self): self.test_assign_non_contiguous(alt=True)
def test_assign_non_contiguous(self, alt=False):
x = (Tensor.arange(16)-100).reshape(4,4).contiguous().realize()
xref = x.numpy()
if alt:
y = Tensor.randint(2, 4).contiguous().realize()
a = Tensor.arange(8).reshape(2, 4)+y
tst = x.shrink(((0, 2), None)).assign(a).realize()
xref[:2, :] = np.arange(8).reshape(2, 4)+y.numpy()
else:
y = Tensor.randint(4, 2).contiguous().realize()
a = Tensor.arange(8).reshape(4, 2)+y
tst = x.shrink((None, (0, 2))).assign(a).realize()
xref[:, :2] = np.arange(8).reshape(4, 2)+y.numpy()
np.testing.assert_equal(x.numpy(), xref)
if RANGEIFY > 0:
# NOTE: this is a bug on non rangeify
np.testing.assert_equal(tst.numpy(), a.numpy())
def test_sparse_categorical_crossentropy_simple(self):
X = Tensor([[0, 2, 3], [1, 2, 3]]).realize()
Y = Tensor([1, 2]).realize()
loss = X.sparse_categorical_crossentropy(Y)
self.check_schedule(loss, 4)
run_schedule(check_schedule(loss, 4))
np.testing.assert_allclose(loss.item(), 0.878309, atol=1e-5, rtol=1e-6)
@unittest.skipIf(Device.DEFAULT == "WEBGPU", "Validation error on WebGPU")
@ -1950,28 +1919,21 @@ class TestIndexing(unittest.TestCase):
yt = Tensor.randn(BS, 10).realize()
with Context(SPLIT_REDUCEOP=0):
loss = yt.sparse_categorical_crossentropy(Y_train[samples])
self.check_schedule(loss, 6)
run_schedule(check_schedule(loss, 6))
loss_fused = loss.numpy()
loss_ref = torch.nn.CrossEntropyLoss()(torch.tensor(yt.numpy()), torch.tensor(Y_train.numpy())[torch.tensor(samples.numpy())])
np.testing.assert_allclose(loss_fused, loss_ref.numpy(), atol=1e-6, rtol=1e-6)
@unittest.expectedFailure
def test_arange_fuse_grouped_children(self):
X = Tensor.randn(4, 4).realize()
r = (X+Tensor.arange(16).reshape(4, 4)).sum()
out0 = r+2
out1 = r+3
self.check_schedule([out0, out1], 1)
run_schedule(check_schedule([out0, out1], 1 if RANGEIFY else 3))
r_ref = (X.numpy()+np.arange(16).reshape(4, 4)).sum()
np.testing.assert_allclose(out0.numpy(), r_ref+2, rtol=2e-7)
np.testing.assert_allclose(out1.numpy(), r_ref+3, rtol=2e-7)
def test_dont_fold_arange_contiguous_view(self):
X = Tensor.randn(4, 4).realize()
r = (X+Tensor.arange(16).reshape(4, 4).contiguous()).sum(1, keepdim=True)
self.check_schedule([r], 2)
np.testing.assert_allclose(r.numpy(), (X.numpy()+np.arange(16).reshape(4, 4)).sum(1, keepdims=True), atol=1e-5, rtol=1e-6)
@unittest.skip("multi output isn't supported")
def test_multiview_arange_children(self):
X = Tensor.randn(2,3,4,4).numpy()
@ -2101,6 +2063,7 @@ class TestView(unittest.TestCase):
run_schedule(sched)
np.testing.assert_equal(b.numpy(), 0)
@expect_rangeify_fails
def test_mask_dim_1(self):
# mask out dim = 1 works too
a = Tensor.rand(10, 10).realize()
@ -2127,6 +2090,7 @@ class TestView(unittest.TestCase):
# a*VIEW(x), where VIEW(x) = 0
# x collapses along with its children
@unittest.skipIf(RANGEIFY, "this only fails if you run all of TestSchedule, some global tensor map bug?")
def test_parent_view_collapses(self):
a = Tensor([1, 2])
b = Tensor.arange(3).contiguous()
@ -2208,84 +2172,6 @@ class TestSimplifier(unittest.TestCase):
assert UPat(Ops.CONST, arg=False).match(sink, {}), f"expected {sink} to collapse to a const False"
assert sink.shape == a.shape
tensor_const_pm = PatternMatcher([
(UPat(Ops.CONST, src=(UPat(Ops.VIEW, src=(UPat(Ops.DEVICE),)),)), lambda: True),
(UPat(Ops.BIND, src=(UPat(Ops.DEFINE_VAR, src=(UPat(Ops.VIEW, src=(UPat(Ops.DEVICE),)))), UPat(Ops.CONST))), lambda: True),
])
class TestConst(unittest.TestCase):
# ** part 1: basic functionality of a tensor directly created from CONST
def test_tensor_const(self):
a = Tensor(1)
print(a.uop)
self.assertTrue(tensor_const_pm.rewrite(a.uop))
def test_tensor_variable(self):
vv = UOp.variable("a", 0, 10).bind(1)
a = Tensor(vv)
print(a.uop)
self.assertTrue(tensor_const_pm.rewrite(a.uop))
def test_const_schedule(self):
a = Tensor.ones((4, 4))
sched = a.schedule()
self.assertEqual(len(sched), 0)
def test_const_contiguous_schedule(self):
# this ends up in the big graph
a = Tensor.ones((4,)).contiguous()
sched = a.schedule()
self.assertEqual(len(sched), 1)
# ** part 2: scheduler behavior when const folding happens later
def test_const_folding_no_realize(self):
a = Tensor([1, 2, 3, 4])*0
sched = a.schedule()
self.assertEqual(len(sched), 0)
def test_src_const_folding(self):
with Context(TRACK_MATCH_STATS=0):
a = Tensor.full((4,), 1).contiguous().realize()
b = Tensor.full((4,), 2).contiguous().realize()
mul0 = a*0
add = b+mul0
sched = add.schedule()
self.assertEqual(len(sched), 0)
# b+0 and b share the same underlying device memory
self.assertIs(add.uop.buffer, b.uop.buffer)
self.assertListEqual(add.tolist(), [2, 2, 2, 2])
def test_src_masked_const_folding(self):
with Context(TRACK_MATCH_STATS=0):
a = Tensor.full((4,), 1).contiguous().realize()
b = Tensor.full((6,), 2).contiguous().realize()
mul0 = a*0
add = b+mul0.pad((1, 1), value=2)
sched = add.schedule()
self.assertEqual(len(sched), 1)
run_schedule(sched)
# add gets assigned to a new buffer
self.assertIsNot(add.uop.base.realized, b.uop.base.realized)
self.assertListEqual(add.tolist(), [4, 2, 2, 2, 2, 4])
# ** part 3: Tensor variable bindings
#@unittest.expectedFailure # TODO: should schedule assert if you try to realize a Variable?
def test_var_schedule(self):
vv = UOp.variable("a", 0, 10).bind(1)
a = Tensor(vv)
sched = a.schedule()
self.assertEqual(len(sched), 0)
def test_add_tvar(self):
vv = UOp.variable("a", 0, 10).bind(1)
a = Tensor(vv)+2
sched, var_vals = a.schedule_with_vars()
self.assertEqual(len(sched), 1)
run_schedule(sched, var_vals)
self.assertEqual(a.tolist(), 3)
@unittest.skipIf(Device.DEFAULT == "CPU", "tests copy from another device to cpu")
class TestCopyFolding(unittest.TestCase):
def test_const_copy_is_free(self):
@ -2299,6 +2185,7 @@ class TestCopyFolding(unittest.TestCase):
b = (a*zeros).to("CPU")
run_schedule(check_schedule(b, 0, filter_sink=False))
self.assertListEqual(b.tolist(), [0, 0, 0])
self.assertEqual(b.device, "CPU")
def test_alu_after_copy(self):
a = Tensor.ones((4,)).to("CPU")
@ -2307,6 +2194,12 @@ class TestCopyFolding(unittest.TestCase):
add.kernelize()
assert all_same([x.device for x in add.uop.src]), f"ALU has different devices! {[x.device for x in add.src]}"
def test_alu_before_copy(self):
buf = Tensor.ones(1).contiguous().realize()
a = buf+1
b = a.to("CPU")
self.assertListEqual(b.tolist(), [2.])
def test_copy_to_same_device(self):
a = Tensor.empty(4).uop
b = a.copy_to_device(a.device)
@ -2323,6 +2216,15 @@ class TestCopyFolding(unittest.TestCase):
b = schedule_graph_rewrite(b)
self.assertIs(b.base, a.base)
def test_copy_to_same_device_sched(self):
a = Tensor.ones(4).contiguous().realize().uop.as_buf()
t = Tensor(a.copy_to_device(a.device))
sched = t.schedule()
assert len([s for s in sched if s.ast.op is Ops.COPY]) == 0
run_schedule(sched)
assert t.uop.is_realized, f"didn't realize Tensor {t}"
self.assertListEqual(t.tolist(), [1.,1.,1.,1.])
def test_clone(self):
a = Tensor.empty(4)
check_schedule(a.clone(), 1, filter_sink=False)
@ -2353,6 +2255,7 @@ class TestCopyFolding(unittest.TestCase):
b.realize()
self.assertListEqual(b.tolist(), [[0, 2], [1, 3]])
@expect_rangeify_fails
def test_permute_on_disk(self):
with open(temp('dt_arange_4_permute'), "wb") as f: f.write(Tensor.arange(4).realize().uop.base.buffer.as_buffer())
a = Tensor.empty(4, dtype=dtypes.int32, device=f"disk:{temp('dt_arange_4_permute')}")
@ -2499,6 +2402,7 @@ class TestUOpBecome(unittest.TestCase):
self.assertEqual(add.uop.shape, (8, 2))
assert add.uop is not add.uop.base
@expect_rangeify_fails
def test_new_flat_buffer(self):
a = Tensor.empty(4,)
b = Tensor.empty(4,)
@ -2524,6 +2428,7 @@ class TestUOpBecome(unittest.TestCase):
z = (img*x) / y
check_schedule(z, 1)
@expect_rangeify_fails
def test_become_existing_buffer(self):
a = Tensor.empty(4, 4)
b = a*1
@ -2551,6 +2456,7 @@ class TestUOpBecome(unittest.TestCase):
check_schedule(b, 0)
assert UPat(Ops.CONST, arg=0).match(b.uop.base, {}) # scheduling replaces the tensor uop with a VIEW(BUFFER)
@expect_rangeify_fails
def test_become_const_in_view(self):
# if we shrink the base down to a size 0, only the VIEW becomes CONST, base is unchanged.
add = Tensor.empty(2, 2)+Tensor.empty(2, 2)
@ -2568,6 +2474,7 @@ class TestUOpBecome(unittest.TestCase):
assert UPat(Ops.CONST, arg=3).match(const_add.uop.base, {})
# tensors can become another realized tensor source
@expect_rangeify_fails
def test_become_existing_buf_simple(self):
a = Tensor.empty(4, 4)
b = a+0
@ -2576,12 +2483,14 @@ class TestUOpBecome(unittest.TestCase):
self.assertIs(a.uop, b.uop)
# they can also chain other movement ops on top of the tensor source
@expect_rangeify_fails
def test_become_existing_buf_view(self):
a = Tensor.empty(4, 4)
b = a.permute((1, 0))+0
check_schedule(b, 0)
self.assertEqual(b.uop.st, a.uop.permute((1, 0)).st)
@expect_rangeify_fails
def test_become_existing_buf_view_alt(self):
a = Tensor.empty(4, 4)
b = a.permute((1, 0)).reshape((8, 2))+0
@ -2589,6 +2498,7 @@ class TestUOpBecome(unittest.TestCase):
self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st)
# they can also have other base parents that simplified, in that case we just backtrack to the chained mops
@expect_rangeify_fails
def test_become_existing_buf_complex(self):
a = Tensor.empty(4, 4)
b = (a.permute((1, 0))+0).reshape((8, 2))+0
@ -2596,6 +2506,7 @@ class TestUOpBecome(unittest.TestCase):
self.assertEqual(b.uop.st, a.uop.permute((1, 0)).reshape((8, 2)).st)
assert b.uop.base.op is Ops.BUFFER
@expect_rangeify_fails
def test_become_multiple_choices(self):
a = Tensor.empty(16)
b = (a.reshape(1, 1, 4, 1, 4)+0).reshape(1, 1, 4, 4).shrink(((0, 1), (0, 1), (0, 3), (0, 3)))+0
@ -2607,6 +2518,7 @@ class TestUOpBecome(unittest.TestCase):
assert b.uop is c.uop
assert UPat(Ops.VIEW, src=(UPat(Ops.BUFFER),)).match(c.uop, {})
@expect_rangeify_fails
def test_setitem_becomes_subbuffer(self):
a = Tensor.full((4,), 2.).contiguous().realize()
b = a.shrink(((0, 2),)).assign(Tensor.full((2,), 1.0))

View file

@ -1,4 +1,6 @@
import unittest
import random
from os import getenv
from tinygrad import Tensor, TinyJit, Variable, dtypes
from tinygrad.helpers import Context
import numpy as np
@ -165,6 +167,41 @@ class TestSetitem(unittest.TestCase):
t[idx] = val
self.assertEqual(t.tolist(), [val]*idx_size+[idx_size])
def test_setitem_advanced_indexing(self):
# Example from https://numpy.org/doc/stable/user/basics.indexing.html#combining-advanced-and-basic-indexing
t = Tensor.zeros(10,20,30,40,50).contiguous()
ind_1 = Tensor([5,3,7,8])
ind_2 = Tensor([[[0],[1],[2]],[[3],[4],[5]]])
v = Tensor.arange(2*3*4*10*30*50).reshape(2,3,4,10,30,50)
t[:, ind_1, :, ind_2, :] = v
n = np.zeros((10,20,30,40,50))
n[:, ind_1.numpy(), :, ind_2.numpy(), :] = v.numpy()
np.testing.assert_allclose(t.numpy(), n)
def test_setitem_2d_tensor_indexing(self):
t = Tensor.zeros(2).contiguous()
index = Tensor([[0, 1], [1,0]])
v = Tensor.arange(2*2).reshape(2, 2).contiguous()
t[index] = v
n = np.zeros((2,))
n[index.numpy()] = v.numpy()
np.testing.assert_allclose(t.numpy(), n)
@unittest.skip("slow")
def test_setitem_tensor_indexing_fuzz(self):
random.seed(getenv("SEED", 42))
for _ in range(getenv("ITERS", 100)):
size = random.randint(5, 10)
d0, d1, d2 = random.randint(1,5), random.randint(1,5), random.randint(1,5)
t = Tensor.zeros(size).contiguous()
n = np.zeros((size,))
index = Tensor.randint((d0, d1, d2), low=0, high=size)
v = Tensor.arange(d0*d1*d2).reshape(d0, d1, d2)
t[index] = v
n[index.numpy()] = v.numpy()
np.testing.assert_allclose(t.numpy(), n, err_msg=f"failed with index={index.numpy().tolist()} and v={v.numpy().tolist()}")
class TestWithGrad(unittest.TestCase):
def test_no_requires_grad_works(self):
z = Tensor.rand(8, 8)

View file

@ -2,6 +2,7 @@ import unittest
from test.helpers import assert_jit_cache_len
from tinygrad import Variable, Tensor, TinyJit
from tinygrad.helpers import RANGEIFY
import numpy as np
class TestSymbolicJit(unittest.TestCase):
@ -11,11 +12,23 @@ class TestSymbolicJit(unittest.TestCase):
a = Tensor.rand(3, 10)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = jf(a[:, :vi]).reshape(3, i).numpy()
symbolic = jf(a[:, :vi])[:3, :i].numpy()
expected = f(a[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1)
def test_plus1_pad(self):
# TODO: without contiguous, the pad is not captured in jit
def f(a): return (a+1).pad((None, (0, 10-a.shape[1]))).contiguous().realize()
jf = TinyJit(f)
a = Tensor.rand(3, 10)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = jf(a[:, :vi]).numpy()
expected = f(a[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1 if RANGEIFY else 2) # one add and one pad, can be one kernel?
def test_add(self):
def f(a, b): return (a+b).realize()
jf = TinyJit(f)
@ -23,7 +36,8 @@ class TestSymbolicJit(unittest.TestCase):
b = Tensor.rand(3, 10)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = jf(a[:, :vi], b[:, :vi]).reshape(3, i).numpy()
symbolic = jf(a[:, :vi], b[:, :vi])
symbolic = symbolic[:3, :i].numpy()
expected = f(a[:, :i], b[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1)
@ -63,10 +77,10 @@ class TestSymbolicJit(unittest.TestCase):
v = Tensor.rand(2, 10, 4, 8)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = jf(q, k[:, :vi], v[:, :vi]).reshape(2, 4, 1, 8).numpy()
symbolic = jf(q, k[:, :vi], v[:, :vi])[:2, :4, :1, :8].numpy()
expected = f(q, k[:, :i], v[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 5)
assert_jit_cache_len(jf, 4 if RANGEIFY else 5)
def test_cat_dim0(self):
def f(a, b): return a.cat(b, dim=0).realize()
@ -75,7 +89,7 @@ class TestSymbolicJit(unittest.TestCase):
b = Tensor.rand(2, 3)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = jf(a[:vi], b).reshape(i+2, 3).numpy()
symbolic = jf(a[:vi], b)[:i+2, :3].numpy()
expected = f(a[:i], b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1)
@ -87,7 +101,7 @@ class TestSymbolicJit(unittest.TestCase):
b = Tensor.rand(3, 2)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = jf(a[:, :vi], b).reshape(3, i+2).numpy()
symbolic = jf(a[:, :vi], b)[:3, :i+2].numpy()
expected = f(a[:, :i], b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1)
@ -101,7 +115,7 @@ class TestSymbolicJit(unittest.TestCase):
for j in range(2, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
symbolic = jf(a[:vi], b[:vj]).reshape(i+j, 3).numpy()
symbolic = jf(a[:vi], b[:vj])[:i+j, :3].numpy()
expected = f(a[:i], b[:j]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1)
@ -115,7 +129,7 @@ class TestSymbolicJit(unittest.TestCase):
for j in range(2, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
symbolic = jf(a[:, :vi], b[:, :vj]).reshape(3, i+j).numpy()
symbolic = jf(a[:, :vi], b[:, :vj])[:3, :i+j].numpy()
expected = f(a[:, :i], b[:, :j]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1)
@ -129,7 +143,7 @@ class TestSymbolicJit(unittest.TestCase):
for j in range(2, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
symbolic = jf(a[:vi, :], b[:, :vj]).reshape(i, j).numpy()
symbolic = jf(a[:vi, :], b[:, :vj])[:i, :j].numpy()
expected = f(a[:i, :], b[:, :j]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1)
@ -143,7 +157,7 @@ class TestSymbolicJit(unittest.TestCase):
for j in range(2, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
symbolic = jf(a[:vj, :], b[:, :vi]).reshape(j, i).numpy()
symbolic = jf(a[:vj, :], b[:, :vi])[:j, :i].numpy()
expected = f(a[:j, :], b[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1)
@ -195,8 +209,8 @@ class TestSymbolicJit(unittest.TestCase):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.ones(vi, 11).contiguous()
symbolic = a[:, 1:2]
symbolic = jf(symbolic).reshape(i, 1).numpy()
expected = f(a.reshape(i, 11)[:, 1:2]).numpy()
symbolic = jf(symbolic)[:i, :1].numpy()
expected = f(a[:i, :][:, 1:2]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
assert_jit_cache_len(jf, 1)
@ -231,7 +245,7 @@ class TestSymbolicJit(unittest.TestCase):
expected = b[:i].mean(0).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
# axis = 1
symbolic = jf1(c[:vi]).reshape(i).numpy()
symbolic = jf1(c[:vi])[:i].numpy()
expected = c[:i].mean(1).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -254,11 +268,11 @@ class TestSymbolicJit(unittest.TestCase):
expected = a[:i, :j].mean().numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
# axis = 0
symbolic = jf0(b[:vi, :vj]).reshape(j).numpy()
symbolic = jf0(b[:vi, :vj])[:j].numpy()
expected = b[:i, :j].mean(0).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
# axis = 1
symbolic = jf1(c[:vi, :vj]).reshape(i).numpy()
symbolic = jf1(c[:vi, :vj])[:i].numpy()
expected = c[:i, :j].mean(1).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -283,7 +297,7 @@ class TestSymbolicJit(unittest.TestCase):
expected = b[:i].var(0).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
# axis = 1
symbolic = jf1(c[:vi]).reshape(i).numpy()
symbolic = jf1(c[:vi])[:i].numpy()
expected = c[:i].var(1).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -306,11 +320,11 @@ class TestSymbolicJit(unittest.TestCase):
expected = a[:i, :j].var().numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
# axis = 0
symbolic = jf0(b[:vi, :vj]).reshape(j).numpy()
symbolic = jf0(b[:vi, :vj])[:j].numpy()
expected = b[:i, :j].var(0).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
# axis = 1
symbolic = jf1(c[:vi, :vj]).reshape(i).numpy()
symbolic = jf1(c[:vi, :vj])[:i].numpy()
expected = c[:i, :j].var(1).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)

View file

@ -13,7 +13,16 @@ class TestSymbolicOps(unittest.TestCase):
a = Tensor.rand(3, 10)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = f(a[:, :vi]).reshape(3, i).numpy()
symbolic = f(a[:, :vi])[:3, :i].numpy()
expected = f(a[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_plus1_pad(self):
def f(a): return (a+1).pad((None, (0, 10-a.shape[1]))).realize()
a = Tensor.rand(3, 10)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = f(a[:, :vi]).numpy()
expected = f(a[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -23,7 +32,7 @@ class TestSymbolicOps(unittest.TestCase):
b = Tensor.rand(3, 10)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = f(a[:, :vi], b[:, :vi]).reshape(3, i).numpy()
symbolic = f(a[:, :vi], b[:, :vi])[:, :i].numpy()
expected = f(a[:, :i], b[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -46,7 +55,7 @@ class TestSymbolicOps(unittest.TestCase):
vi = Variable("i", 1, 10).bind(i) if use_symbolic else i
Tensor.realize(q, k, v)
GlobalCounters.reset()
symbolic = f(q, k[:, :vi, :, :], v[:, :vi, :, :]).reshape(2, 4, 1, 8).numpy()
symbolic = f(q, k[:, :vi, :, :], v[:, :vi, :, :])[:2, :4, :1, :8].numpy()
expected = f(q, k[:, :i, :, :], v[:, :i, :, :]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -85,7 +94,7 @@ class TestSymbolicOps(unittest.TestCase):
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
b = Tensor.rand(2, 3)
symbolic = f(a[:vi, :], b).reshape(i+2, 3).numpy()
symbolic = f(a[:vi, :], b)[:i+2, :3].numpy()
expected = f(a[:i, :], b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -95,7 +104,7 @@ class TestSymbolicOps(unittest.TestCase):
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
b = Tensor.rand(3, 2)
symbolic = f(a[:, :vi], b).reshape(3, i+2).numpy()
symbolic = f(a[:, :vi], b)[:3, :i+2].numpy()
expected = f(a[:, :i], b).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -107,7 +116,7 @@ class TestSymbolicOps(unittest.TestCase):
for j in range(2, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
symbolic = f(a[:vi, :], b[:vj, :]).reshape(i+j, 3).numpy()
symbolic = f(a[:vi, :], b[:vj, :])[:i+j, :3].numpy()
expected = f(a[:i, :], b[:j, :]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -119,50 +128,41 @@ class TestSymbolicOps(unittest.TestCase):
for j in range(2, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
symbolic = f(a[:, :vi], b[:, :vj]).reshape(3, i+j).numpy()
symbolic = f(a[:, :vi], b[:, :vj])[:3, :i+j].numpy()
expected = f(a[:, :i], b[:, :j]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_two_vars_plus1_ij(self):
def f(a, b): return (a@b+1).realize()
a = Tensor.rand(10, 3)
b = Tensor.rand(3, 10)
a = Tensor.rand(10, 3).realize()
b = Tensor.rand(3, 10).realize()
for i in range(2, 5):
for j in range(2, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
symbolic = f(a[:vi, :], b[:, :vj]).reshape(i, j).numpy()
symbolic = f(a[:vi, :], b[:, :vj])[:i, :j].numpy()
expected = f(a[:i, :], b[:, :j]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_two_vars_plus1_ji(self):
# reverse the order of variables
def f(a, b): return (a@b+1).realize()
a = Tensor.rand(10, 3)
b = Tensor.rand(3, 10)
a = Tensor.rand(10, 3).realize()
b = Tensor.rand(3, 10).realize()
for i in range(2, 5):
for j in range(2, 5):
vi = Variable("i", 1, 10).bind(i)
vj = Variable("j", 1, 10).bind(j)
symbolic = f(a[:vj, :], b[:, :vi]).reshape(j, i).numpy()
symbolic = f(a[:vj, :], b[:, :vi])[:j, :i].numpy()
expected = f(a[:j, :], b[:, :i]).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_reshape_from_symbolic(self):
a = Tensor.rand(30)
for i in range(3, 5):
vi = Variable("i", 3, 10).bind(i)
symbolic = a[:vi*3].reshape((3, 3)).numpy()
# To match symbolic reshape (potential implicit shrink), we need a shrink
expected = a[:i*3].shrink(((0, 9),)).reshape((3, 3)).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_invalid_symbolic_reshape(self):
a = Tensor.rand(30)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
# Cannot reshape into symbolic from non-symbolic
with self.assertRaises(AssertionError): a.reshape((3, vi))
with self.assertRaises(ValueError): a.reshape((3, vi))
def test_shrink(self):
for i in range(1, 5):
@ -178,6 +178,7 @@ class TestSymbolicOps(unittest.TestCase):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.rand(7, 11)
symbolic = a[3:5, vi:vi+2]
print(symbolic.shape)
symbolic = symbolic.numpy()
expected = a[3:5, i:i+2].numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -186,7 +187,7 @@ class TestSymbolicOps(unittest.TestCase):
a = Tensor.rand(7, 11)
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
symbolic = a[3:5, :vi:1].reshape(2, i).numpy()
symbolic = a[3:5, :vi:1][:2, :i].numpy()
expected = a[3:5, :i:1].numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -194,7 +195,7 @@ class TestSymbolicOps(unittest.TestCase):
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor(1).unsqueeze(0).pad((0, 1)).unsqueeze(0)
symbolic = a.expand(vi, 2).reshape(i, 2).numpy()
symbolic = a.expand(vi, 2)[:i, :2].numpy()
expected = a.expand(i, 2).numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
@ -202,8 +203,8 @@ class TestSymbolicOps(unittest.TestCase):
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
a = Tensor.ones(vi, 11).contiguous()
symbolic = a[:, 1:2].reshape(i, 1).numpy()
expected = a.reshape(i, 11)[:, 1:2].numpy()
symbolic = a[:, 1:2][:i, :1].numpy()
expected = Tensor.ones(i, 11)[:, 1:2].numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_ones_sum(self):
@ -220,7 +221,11 @@ class TestSymbolicOps(unittest.TestCase):
vi = Variable("i", 1, 10).bind(i)
for axis in [None, 0, 1]:
expected = a[:i].mean(axis).numpy()
symbolic = a[:vi].mean(axis).reshape(expected.shape).numpy()
symbolic = a[:vi].mean(axis)
if axis is None:
symbolic = symbolic.numpy()
else:
symbolic = symbolic[:expected.shape[0]].numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_mean_2d(self):
@ -231,7 +236,11 @@ class TestSymbolicOps(unittest.TestCase):
vj = Variable("j", 1, 10).bind(j)
for axis in [None, 0, 1]:
expected = a[:i, :j].mean(axis).numpy()
symbolic = a[:vi, :vj].mean(axis).reshape(expected.shape).numpy()
symbolic = a[:vi, :vj].mean(axis)
if axis is None:
symbolic = symbolic.numpy()
else:
symbolic = symbolic[:expected.shape[0]].numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_var(self):
@ -240,7 +249,11 @@ class TestSymbolicOps(unittest.TestCase):
vi = Variable("i", 1, 10).bind(i)
for axis in [None, 0, 1]:
expected = a[:i].var(axis).numpy()
symbolic = a[:vi].var(axis).reshape(expected.shape).numpy()
symbolic = a[:vi].var(axis)
if axis is None:
symbolic = symbolic.numpy()
else:
symbolic = symbolic[:expected.shape[0]].numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_var_2d(self):
@ -251,7 +264,11 @@ class TestSymbolicOps(unittest.TestCase):
vj = Variable("j", 1, 10).bind(j)
for axis in [None, 0, 1]:
expected = a[:i, :j].var(axis).numpy()
symbolic = a[:vi, :vj].var(axis).reshape(expected.shape).numpy()
symbolic_result = a[:vi, :vj].var(axis)
if axis is None:
symbolic = symbolic_result.numpy()
else:
symbolic = symbolic_result[:expected.shape[0]].numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=1e-6)
def test_bitcast_down(self):
@ -259,7 +276,11 @@ class TestSymbolicOps(unittest.TestCase):
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
expected = a[:i].bitcast(dtypes.uint8).numpy()
symbolic = a[:vi].bitcast(dtypes.uint8).reshape(expected.shape).numpy()
symbolic_result = a[:vi].bitcast(dtypes.uint8)
if len(expected.shape) == 2:
symbolic = symbolic_result[:expected.shape[0], :expected.shape[1]].numpy()
else:
symbolic = symbolic_result[:].numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=0)
@unittest.skipUnless(is_dtype_supported(dtypes.uint64), "no uint64")
@ -268,7 +289,11 @@ class TestSymbolicOps(unittest.TestCase):
for i in range(1, 5):
vi = Variable("i", 1, 10).bind(i)
expected = a[:i].bitcast(dtypes.uint64).numpy()
symbolic = a[:vi].bitcast(dtypes.uint64).reshape(expected.shape).numpy()
symbolic_result = a[:vi].bitcast(dtypes.uint64)
if len(expected.shape) == 2:
symbolic = symbolic_result[:expected.shape[0], :expected.shape[1]].numpy()
else:
symbolic = symbolic_result[:].numpy()
np.testing.assert_allclose(symbolic, expected, atol=1e-6, rtol=0)
@unittest.expectedFailure

View file

@ -4,12 +4,12 @@ import torch
import unittest, copy, mmap, random, math, array
from tinygrad import Tensor, Device, dtypes
from tinygrad.tensor import _METADATA
from tinygrad.helpers import getenv, temp, mv_address
from tinygrad.helpers import getenv, temp, mv_address, RANGEIFY
from extra.gradcheck import numerical_jacobian, jacobian, gradcheck
from hypothesis import given, settings, strategies as strat
from tinygrad.device import is_dtype_supported
from tinygrad.uop.ops import Ops, UOp
from tinygrad.runtime.support.compiler_cuda import PTX
from tinygrad.renderer.ptx import PTXRenderer
from tinygrad.codegen import full_rewrite
from tinygrad.dtype import DType
@ -516,10 +516,6 @@ class TestTinygrad(unittest.TestCase):
print(c)
def test_env_overwrite_default_device(self):
subprocess.run(['DISK=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT != \\"DISK\\""'],
shell=True, check=True)
subprocess.run(['NPY=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT != \\"NPY\\""'],
shell=True, check=True)
subprocess.run([f'{Device.DEFAULT}=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == \\"{Device.DEFAULT}\\""'],
shell=True, check=True)
subprocess.run([f'DISK=1 {Device.DEFAULT}=1 python3 -c "from tinygrad import Device; assert Device.DEFAULT == \\"{Device.DEFAULT}\\""'],
@ -554,6 +550,11 @@ class TestTinygrad(unittest.TestCase):
def test_shrink(self):
t = Tensor.arange(32).contiguous().realize()
self.assertListEqual(t[16:20].tolist(), [16,17,18,19])
self.assertListEqual(t.shrink_to(16).tolist(), list(range(16)))
t = t.reshape(4, 8).contiguous().realize()
self.assertListEqual(t.shrink_to(2, 2).tolist(), [[0, 1], [8, 9]])
with self.assertRaises(ValueError): t.shrink_to(2)
with self.assertRaises(ValueError): t.shrink_to(2, 2, 2)
@unittest.skip("this test is just flaky, sync issue")
class TestMoveTensor(unittest.TestCase):
@ -648,17 +649,22 @@ class TestZeroShapeTensor(unittest.TestCase):
def test_pad(self):
t = Tensor.rand(3, 2, 0).pad((None, None, (1, 1)), value=1)
assert t.shape == (3, 2, 2)
self.assertEqual(t.shape, (3, 2, 2))
np.testing.assert_equal(t.numpy(), np.ones((3, 2, 2)))
t = Tensor.rand(3, 2, 0).pad((None, (1, 1), None), value=1)
assert t.shape == (3, 4, 0)
self.assertEqual(t.shape, (3, 4, 0))
np.testing.assert_equal(t.numpy(), np.ones((3, 4, 0)))
t = Tensor.rand(3, 2, 0).pad(((1, 1), None, None), value=1)
assert t.shape == (5, 2, 0)
self.assertEqual(t.shape, (5, 2, 0))
np.testing.assert_equal(t.numpy(), np.ones((5, 2, 0)))
np.testing.assert_equal(Tensor([1, 2]).pad_to(4).numpy(), [1, 2, 0, 0])
np.testing.assert_equal(Tensor([[1, 2]]).pad_to(2, 3).numpy(), [[1, 2, 0], [0, 0, 0]])
with self.assertRaises(TypeError): Tensor([1, 2]).pad_to(2, 3)
with self.assertRaises(TypeError): Tensor([[1, 2]]).pad_to(3)
def test_shrink_into_zero(self):
t = Tensor.rand(3, 4).realize()
assert t.shrink((None, (2, 2))).realize().shape == (3, 0)
@ -865,11 +871,18 @@ class TestTensorMetadata(unittest.TestCase):
self.assertEqual(y.grad.uop.metadata[0].name, "sigmoid")
self.assertTrue(y.grad.uop.metadata[0].backward)
si = Tensor.schedule(out, x.grad, y.grad)[-1]
self.assertEqual(len(si.metadata), 4, f"failed with {si.metadata}")
self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "__mul__", "relu"})
bw = [m for m in si.metadata if m.backward]
self.assertEqual(len(bw), 2)
self.assertEqual(bw[0].name, "sigmoid")
if not RANGEIFY:
self.assertEqual(len(si.metadata), 4, f"failed with {si.metadata}")
self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "__mul__", "relu"})
bw = [m for m in si.metadata if m.backward]
self.assertEqual(len(bw), 2)
self.assertEqual(bw[0].name, "sigmoid")
else:
self.assertEqual(len(si.metadata), 3, f"failed with {si.metadata}")
self.assertSetEqual(set(m.name for m in si.metadata), {"sigmoid", "relu"})
bw = [m for m in si.metadata if m.backward]
self.assertEqual(len(bw), 1)
self.assertEqual(bw[0].name, "sigmoid")
class TestIdxUpcast(unittest.TestCase):
def _find_op(self, ast: UOp, op: Ops):
@ -915,13 +928,17 @@ class TestIdxUpcast(unittest.TestCase):
def test_regular_sym(self):
self.do_op_then_assert(dtypes.int, 2048, 2048, UOp.variable("dim3", 1, 64).bind(32))
@unittest.skipIf(PTX, "PTX always convert Ops.INDEX to int64")
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "PTX always convert Ops.INDEX to int64")
def test_symfold(self):
# This would cause an overflow, but after sym fold it's within int32
a = Tensor.arange(65535)
uops = self._schedule_render(a)
assert all(uop.dtype is not dtypes.long for uop in uops)
def test_arange_raise_overflow(self):
with self.assertRaises(ValueError):
self._schedule_render(Tensor.arange(2**33, dtype=dtypes.int))
@unittest.skipIf(is_dtype_supported(dtypes.long), "int64 is supported")
def test_int64_unsupported_overflow_sym(self):
with self.assertRaises(KeyError):

View file

@ -38,7 +38,7 @@ class TestTensorVariable(unittest.TestCase):
vv = Variable("a", 1, 10).bind(2)
vv2 = Variable("b", 1, 10).bind(2)
t = Tensor.ones(10, 10).contiguous()[:vv2, :vv]
ret = t.mean(axis=1).reshape(2, 1).numpy()
ret = t.mean(axis=1)[:2].reshape(2, 1).numpy()
assert np.all(ret == 1)
def test_symbolic_mean_2d_add(self):
@ -66,25 +66,25 @@ class TestTensorVariable(unittest.TestCase):
def test_symbolic_arange(self):
vv = Variable("a", 1, 10)
ret = Tensor.arange(0, vv.bind(4))
self.assertListEqual(ret.reshape(4).tolist(), [0,1,2,3])
self.assertListEqual(ret[:4].tolist(), [0,1,2,3])
def test_symbolic_arange_sym_start(self):
vv = Variable("a", 1, 6)
ret = Tensor.arange(vv.bind(4), 7)
self.assertListEqual(ret.reshape(3).tolist(), [4,5,6])
self.assertListEqual(ret[:3].tolist(), [4,5,6])
# TODO: add vmin/vmax pattern for symbolic denominator
@unittest.expectedFailure
def test_symbolic_arange_sym_step(self):
vv = Variable("step", 1, 3)
ret = Tensor.arange(0, 10, vv.bind(2))
self.assertListEqual(ret.reshape(5).tolist(), [0,2,4,6,8])
self.assertListEqual(ret[:5].tolist(), [0,2,4,6,8])
def test_symbolic_arange_two_vars(self):
begin = Variable("b", 1, 5)
end = Variable("e", 6, 10)
ret = Tensor.arange(begin.bind(4), end.bind(7))
self.assertListEqual(ret.reshape(3).tolist(), [4,5,6])
self.assertListEqual(ret[:3].tolist(), [4,5,6])
def test_variable_empty(self):
v = Variable("i", 1, 10)

View file

@ -7,6 +7,14 @@ class TestTiny(unittest.TestCase):
# *** basic functionality ***
def test_const(self):
const = Tensor(2.0)
self.assertEqual(const.item(), 2.0)
def test_copy(self):
out = Tensor([1.,2,3])
self.assertListEqual(out.tolist(), [1.0, 2.0, 3.0])
def test_plus(self):
out = Tensor([1.,2,3]) + Tensor([4.,5,6])
self.assertListEqual(out.tolist(), [5.0, 7.0, 9.0])
@ -87,7 +95,7 @@ class TestTiny(unittest.TestCase):
ones = Tensor.ones(10).contiguous()
for s in [2,5]:
ret = ones[:i.bind(s)] + 1
self.assertListEqual(ret.contiguous().reshape(s).tolist(), [2.0]*s)
self.assertListEqual(ret.contiguous()[:s].tolist(), [2.0]*s)
def test_symbolic_reduce(self):
i = Variable('i', 1, 10)
@ -137,7 +145,7 @@ class TestTiny(unittest.TestCase):
# *** image ***
@unittest.skipIf(Device.DEFAULT != "GPU", "image only supported on GPU")
@unittest.skipIf(Device.DEFAULT != "CL", "image only supported on CL")
def test_image(self):
with Context(IMAGE=2): self.test_gemm(N=4, out_dtype=dtypes.imagef((4, 1, 4)))

View file

@ -417,13 +417,45 @@ class TestUOpGraph(unittest.TestCase):
uops = to_uops_list([v.bitcast(dt)])
self.assertEqual(len([x for x in uops if x.op is Ops.BITCAST]), 0, f"dtype = {dt}")
def test_where_on_gated_load_fold(self):
ridx0 = UOp.range(100, 0)
d0 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 0)
ld = d0.index(ridx0, ridx0<50).load()
w = (ridx0<50).where(ld, 5)
uops = to_uops_list([w])
for u in uops:
assert u.op is not Ops.WHERE
if u.op is Ops.LOAD: assert u.src[1].arg==5
def test_where_on_gated_load_folds_swapped_branches(self):
ridx0 = UOp.range(100, 0)
d0 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 0)
ld = d0.index(ridx0, (ridx0<50).logical_not()).load()
w = (ridx0<50).where(5, ld)
uops = to_uops_list([w])
for u in uops:
assert u.op is not Ops.WHERE
if u.op is Ops.LOAD: assert u.src[1].arg==5
def test_where_in_store_becomes_gate(self):
ridx0 = UOp.range(100, 0)
d0 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 0)
idx = d0.index(ridx0)
ld = idx.load()
val = (ridx0<50).where(5, ld)
st = idx.store(val, ridx0)
uops = to_uops_list([st])
for u in uops:
assert u.op is not Ops.WHERE
if u.op is Ops.STORE: assert u.src[1].arg==5
def test_load_idx_becomes_int(self):
d0 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 0)
d1 = UOp(Ops.DEFINE_GLOBAL, dtypes.long.ptr(), (), 1)
l0 = UOp(Ops.LOAD, dtypes.long, (d0.index(UOp.const(dtypes.int, 0)),))
l0 = UOp(Ops.LOAD, dtypes.long, (d0.index(UOp.const(dtypes.int, 0)),)).cast(dtypes.index)
idx = l0 * 600
valid = (l0<-1).ne(True)&(l0<3000)
l1 = UOp(Ops.LOAD, dtypes.long, (d1.index(idx, valid),))
l1 = UOp(Ops.LOAD, dtypes.long, (d1.index(idx.valid(valid)),))
uops = to_uops_list([l1])
for u in uops:
if u.op is Ops.INDEX: self.assertEqual(u.src[1].dtype, dtypes.int)
@ -560,7 +592,7 @@ class TestUOpGraph(unittest.TestCase):
glbl1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), (), 1)
glbl2 = UOp(Ops.DEFINE_GLOBAL, dtypes.int.ptr(), (), 2)
idx = UOp.const(dtypes.int, 0)
ld0 = UOp(Ops.LOAD, dtypes.int, (glbl1.index(idx, UOp.const(dtypes.bool, False)),))
ld0 = UOp(Ops.LOAD, dtypes.int, (glbl1.index(UOp.invalid()),))
ld1 = UOp(Ops.LOAD, dtypes.int, (glbl2.index(idx, UOp.const(dtypes.bool, True)),))
uops = to_uops_list([UOp(Ops.STORE, dtypes.void, (glbl0.index(idx), ld1+ld0))])
ld0 = uops[-1].src[-1]
@ -573,7 +605,7 @@ class TestUOpGraph(unittest.TestCase):
lidx = UOp(Ops.SPECIAL, dtypes.int, (UOp.const(dtypes.int, 16),), "lidx0")
st = UOp(Ops.STORE, dtypes.void, (smem.index(lidx), UOp.load(glbl0.index(lidx), dtype=dtypes.int)))
barrier = UOp(Ops.BARRIER, dtypes.void, (st, ))
ld0 = UOp(Ops.LOAD, dtypes.int, (smem.index(lidx+1, UOp.const(dtypes.bool, False)), barrier))
ld0 = UOp(Ops.LOAD, dtypes.int, (smem.index(UOp.invalid()), barrier))
ld1 = UOp(Ops.LOAD, dtypes.int, (smem.index(lidx+2, UOp.const(dtypes.bool, True)), barrier))
uops = to_uops_list([UOp(Ops.STORE, dtypes.void, (glbl0.index(lidx), ld1+ld0))])
@ -586,7 +618,7 @@ class TestUOpGraph(unittest.TestCase):
idx0 = UOp.const(dtypes.int, 0)
idx1 = UOp.const(dtypes.int, 0)
val = UOp.const(dtypes.int, 42)
st0 = glbl.index(idx0, UOp.const(dtypes.bool, False)).store(val)
st0 = glbl.index(UOp.invalid()).store(val)
st1 = glbl.index(idx0, UOp.const(dtypes.bool, True)).store(val)
uops = to_uops_list([st0, st1])
# only the second store happens

View file

@ -15,6 +15,7 @@ from tinygrad.codegen import full_rewrite
from tinygrad.uop.symbolic import sym
from tinygrad.device import is_dtype_supported
from tinygrad.codegen.opt import Opt, OptOps
from tinygrad.renderer.ptx import PTXRenderer
def to_uops_list(u:list[UOp], opts=None, skip_check=False) -> list[UOp]: return full_rewrite(UOp.sink(*u), opts)
@ -130,9 +131,9 @@ class TestFloatUOps(TestUOps):
class TestNonFloatUOps(TestUOps):
def test_add_int32(self): self._test_bop_fxn(Ops.ADD, lambda a,b: int(a)+int(b), (dtypes.int32, dtypes.int32))
def test_mul_int32(self): self._test_bop_fxn(Ops.MUL, lambda a,b: int(a)*int(b), (dtypes.int32, dtypes.int32))
@unittest.skipUnless(getenv("PTX"), "only ptx uses bitshifts")
@unittest.skipUnless(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "only ptx uses bitshifts")
def test_shr_int32(self): self._test_bop_fxn(Ops.SHR, lambda a,b: int(a)>>int(b), (dtypes.int32, dtypes.int32), no_b_neg=True)
@unittest.skipUnless(getenv("PTX"), "only ptx uses bitshifts")
@unittest.skipUnless(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "only ptx uses bitshifts")
def test_shl_int32(self): self._test_bop_fxn(Ops.SHL, lambda a,b: int(a)<<int(b), (dtypes.int32, dtypes.int32), no_b_neg=True)
def test_div_int32(self):
self._test_bop_fxn(Ops.IDIV, lambda a,b: int(a/b), (dtypes.int32, dtypes.int32), no_b_zero=True)
@ -370,7 +371,7 @@ class TestLocalAccess(unittest.TestCase):
sres = uop(uops, Ops.LOAD, dtypes.int32, (smem.index(ofs),))
self.assertEqual(_test_uops_result(dtypes.int32, uops, sres), 42)
@unittest.skipUnless(getenv("PTX"), "This only tests assembly backends")
@unittest.skipUnless(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "This only tests assembly backends")
class TestAssembly(unittest.TestCase):
def test_bitshift_left(self):
g1 = UOp(Ops.DEFINE_GLOBAL, dtypes.int32.ptr(), (), 0)
@ -512,7 +513,7 @@ class TestUOpStr(unittest.TestCase):
assert str(eval(str(vec))) == str(vec)
def test_device_arg(self):
device = UOp(Ops.DEVICE, arg="GPU")
device = UOp(Ops.DEVICE, arg="CL")
assert str(eval(str(device))) == str(device)
def test_reduceop_arg(self):

View file

@ -8,6 +8,7 @@ from tinygrad.uop.ops import Ops, UOp
from tinygrad.dtype import dtypes
from tinygrad.codegen.opt import Opt, OptOps, KernelOptError
from tinygrad.device import Device
from tinygrad.renderer.ptx import PTXRenderer
def flops_mem(uops, ignore_indexing=False):
est = Estimates.from_uops(uops, ignore_indexing)
@ -158,7 +159,7 @@ class TestUOpsStats(unittest.TestCase):
self.assertEqual(flops_mem(uops), flops_mem(uops_fma))
N = 64
@unittest.skipIf(getenv("PTX"), "wrong in PTX") # maybe?
@unittest.skipIf(isinstance(Device[Device.DEFAULT].renderer, PTXRenderer), "wrong in PTX") # maybe?
class TestStatsOptimized(unittest.TestCase):
@classmethod
def setUpClass(cls):

View file

@ -2,19 +2,19 @@
import unittest, os, subprocess, sys
from tinygrad import Tensor
from tinygrad.device import Device, Compiler
from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context
from tinygrad.helpers import diskcache_get, diskcache_put, getenv, Context, WIN, CI
class TestDevice(unittest.TestCase):
def test_canonicalize(self):
self.assertEqual(Device.canonicalize(None), Device.DEFAULT)
self.assertEqual(Device.canonicalize("CPU"), "CPU")
self.assertEqual(Device.canonicalize("cpu"), "CPU")
self.assertEqual(Device.canonicalize("GPU"), "GPU")
self.assertEqual(Device.canonicalize("GPU:0"), "GPU")
self.assertEqual(Device.canonicalize("gpu:0"), "GPU")
self.assertEqual(Device.canonicalize("GPU:1"), "GPU:1")
self.assertEqual(Device.canonicalize("gpu:1"), "GPU:1")
self.assertEqual(Device.canonicalize("GPU:2"), "GPU:2")
self.assertEqual(Device.canonicalize("CL"), "CL")
self.assertEqual(Device.canonicalize("CL:0"), "CL")
self.assertEqual(Device.canonicalize("cl:0"), "CL")
self.assertEqual(Device.canonicalize("CL:1"), "CL:1")
self.assertEqual(Device.canonicalize("cl:1"), "CL:1")
self.assertEqual(Device.canonicalize("CL:2"), "CL:2")
self.assertEqual(Device.canonicalize("disk:/dev/shm/test"), "DISK:/dev/shm/test")
self.assertEqual(Device.canonicalize("disk:000.txt"), "DISK:000.txt")
@ -28,6 +28,51 @@ class TestDevice(unittest.TestCase):
self.assertEqual(Device.canonicalize(None), device)
Device.DEFAULT = device
@unittest.skipIf(WIN and CI, "skipping windows test") # TODO: subproccess causes memory violation?
def test_env_overwrite_default_compiler(self):
expect_failure = "\ntry: assert Device[Device.DEFAULT].compiler is None;\nexcept RuntimeError: pass"
if Device.DEFAULT == "CPU":
from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler
try: _, _ = CPULLVMCompiler(), ClangJITCompiler()
except Exception as e: self.skipTest(f"skipping compiler test: not all compilers: {e}")
imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_cpu import CPULLVMCompiler, ClangJITCompiler"
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "1"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, ClangJITCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_LLVM": "0"})
subprocess.run([f'python3 -c "{imports}; {expect_failure}"'],
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CLANGJIT": "0", "CPU_LLVM": "0"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, CPULLVMCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CLANGJIT": "0"})
subprocess.run([f'python3 -c "{imports}; {expect_failure}"'],
shell=True, check=True, env={**os.environ, "DEV": "CPU", "CPU_CLANGJIT": "1", "CPU_LLVM": "1"})
elif Device.DEFAULT == "AMD":
from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler
try: _, _ = HIPCompiler(Device[Device.DEFAULT].arch), AMDLLVMCompiler(Device[Device.DEFAULT].arch)
except Exception as e: self.skipTest(f"skipping compiler test: not all compilers: {e}")
imports = "from tinygrad import Device; from tinygrad.runtime.support.compiler_amd import HIPCompiler, AMDLLVMCompiler"
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, AMDLLVMCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "1"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_LLVM": "0"})
subprocess.run([f'python3 -c "{imports}; assert isinstance(Device[Device.DEFAULT].compiler, HIPCompiler)"'],
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_HIP": "1"})
subprocess.run([f'python3 -c "{imports}; {expect_failure}"'],
shell=True, check=True, env={**os.environ, "DEV": "AMD", "AMD_HIP": "1", "AMD_LLVM": "1"})
else: self.skipTest("only run on CPU/AMD")
def test_compiler_envvar(self):
d = Device[Device.DEFAULT]
dname = Device.DEFAULT.split(':')[0].upper()
assert d._get_compiler_envvar(type("Compiler", (), {})) == f"{dname}_COMPILER"
assert d._get_compiler_envvar(type("LLVMCompiler", (), {})) == f"{dname}_LLVM"
assert d._get_compiler_envvar(type("RandomCompiler", (), {})) == f"{dname}_RANDOM"
assert d._get_compiler_envvar(type(f"{dname}Compiler", (), {})) == f"{dname}_{dname}COMPILER" # do not repeat device name alone
assert d._get_compiler_envvar(type(f"{dname}LLVMCompiler", (), {})) == f"{dname}_LLVM" # do not repeat device name
class MockCompiler(Compiler):
def __init__(self, key): super().__init__(key)
def compile(self, src) -> bytes: return src.encode()
@ -56,7 +101,7 @@ class TestCompiler(unittest.TestCase):
class TestRunAsModule(unittest.TestCase):
def test_module_runs(self):
p = subprocess.run([sys.executable, "-m", "tinygrad.device"],stdout=subprocess.PIPE, stderr=subprocess.PIPE,
env={**os.environ, "DEBUG": "1"}, timeout=10,)
env={**os.environ, "DEBUG": "1"}, timeout=30,)
out = (p.stdout + p.stderr).decode()
self.assertEqual(p.returncode, 0, msg=out)
self.assertIn("CPU", out) # for sanity check

View file

@ -307,7 +307,7 @@ class TestDiskTensor(unittest.TestCase):
ret = t.bitcast(dtypes.uint16).to("CPU") + 1
assert ret.tolist() == [2827, 3341, 3855, 4369]
@unittest.skipIf(OSX, "new LLVM has an issue on OSX")
@unittest.skipIf(OSX or Device.DEFAULT == "CL", "new LLVM has an issue on OSX, CL=1 gives the wrong output")
def test_bf16_disk_write_read(self):
t = Tensor([10000, -1, -1000, -10000, 20], dtype=dtypes.float32)
t.to(f"disk:{temp('dt_bf16_disk_write_read_f32')}").realize()

View file

@ -1,12 +1,11 @@
import unittest, math, operator, subprocess, struct
from tinygrad.tensor import Tensor, dtypes, Device
from tinygrad.dtype import DType, DTYPES_DICT, truncate, truncate_fp16, float_to_bf16, _to_np_dtype, least_upper_dtype, least_upper_float
from tinygrad.dtype import DType, DTYPES_DICT, truncate, float_to_fp16, float_to_bf16, _to_np_dtype, least_upper_dtype, least_upper_float
from tinygrad.device import is_dtype_supported
from tinygrad.helpers import getenv, CI, DEBUG
from hypothesis import given, settings, strategies as strat
import numpy as np
import torch
import ml_dtypes
settings.register_profile("my_profile", max_examples=200, deadline=None, derandomize=getenv("DERANDOMIZE_CI", False))
settings.load_profile("my_profile")
@ -22,7 +21,9 @@ def _assert_eq(tensor:Tensor, target_dtype:DType, target, tol_target_dtype:float
if DEBUG >= 2: print(tensor.numpy())
try:
assert tensor.dtype == target_dtype
np.testing.assert_allclose(tensor.numpy(), target, rtol={dtypes.float16:1e-3, dtypes.bfloat16:1e-2}.get(target_dtype, tol_target_dtype))
np.testing.assert_allclose(tensor.numpy(), target, rtol={dtypes.float16:1e-3, dtypes.bfloat16:1e-2,
dtypes.fp8e4m3:1e-1, dtypes.fp8e5m2:5e-1}.get(target_dtype, tol_target_dtype))
except AssertionError as e:
raise AssertionError(f"\ntensor {tensor.numpy()} dtype {tensor.dtype} does not match target {target} with dtype {target_dtype}") from e
@ -105,16 +106,16 @@ class TestHelpers(unittest.TestCase):
self.assertEqual(dt.min, dt.vec(4).min)
self.assertEqual(dt.max, dt.vec(4).max)
def test_truncate_fp16(self):
self.assertEqual(truncate_fp16(1), 1)
self.assertEqual(truncate_fp16(65504), 65504)
self.assertEqual(truncate_fp16(65519.999), 65504)
self.assertEqual(truncate_fp16(65520), math.inf)
self.assertEqual(truncate_fp16(1e-8), 0.0)
self.assertEqual(truncate_fp16(-65504), -65504)
self.assertEqual(truncate_fp16(-65519.999), -65504)
self.assertEqual(truncate_fp16(-65520), -math.inf)
self.assertTrue(math.isnan(truncate_fp16(math.nan)))
def test_float_to_fp16(self):
self.assertEqual(float_to_fp16(1), 1)
self.assertEqual(float_to_fp16(65504), 65504)
self.assertEqual(float_to_fp16(65519.999), 65504)
self.assertEqual(float_to_fp16(65520), math.inf)
self.assertEqual(float_to_fp16(1e-8), 0.0)
self.assertEqual(float_to_fp16(-65504), -65504)
self.assertEqual(float_to_fp16(-65519.999), -65504)
self.assertEqual(float_to_fp16(-65520), -math.inf)
self.assertTrue(math.isnan(float_to_fp16(math.nan)))
def test_float_to_bf16(self):
# TODO: fuzz this better
@ -190,7 +191,7 @@ class TestHelpers(unittest.TestCase):
elif math.isinf(x): np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), math.copysign(math.nan, x))
elif x > FP8E4M3_MAX: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), FP8E4M3_MAX)
elif x < -FP8E4M3_MAX: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), -FP8E4M3_MAX)
else: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), ml_dtypes.float8_e4m3fn(x))
else: np.testing.assert_equal(truncate[dtypes.fp8e4m3](x), torch.tensor(x, dtype=torch.float8_e4m3fn).float().item())
@given(strat.floats(width=32, allow_subnormal=True, allow_nan=True, allow_infinity=True))
def test_truncate_fp8e5m2(self, x):
@ -198,7 +199,7 @@ class TestHelpers(unittest.TestCase):
elif math.isinf(x): np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), x)
elif x > FP8E5M2_MAX: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), FP8E5M2_MAX)
elif x < -FP8E5M2_MAX: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), -FP8E5M2_MAX)
else: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), ml_dtypes.float8_e5m2(x))
else: np.testing.assert_equal(truncate[dtypes.fp8e5m2](x), torch.tensor(x, dtype=torch.float8_e5m2).float().item())
class TestTypeSpec(unittest.TestCase):
def setUp(self):
@ -378,7 +379,7 @@ class TestTypePromotion(unittest.TestCase):
assert least_upper_dtype(dtypes.int32, dtypes.uint32) == dtypes.int64
assert least_upper_dtype(dtypes.uint32, dtypes.int64) == dtypes.int64
# similar to jax but we don't use weak type
assert least_upper_dtype(dtypes.int64, dtypes.uint64) == dtypes.float16
assert least_upper_dtype(dtypes.int64, dtypes.uint64) == dtypes.fp8e4m3
assert least_upper_dtype(dtypes.float16, dtypes.float32) == dtypes.float32
assert least_upper_dtype(dtypes.float32, dtypes.float64) == dtypes.float64
@ -387,6 +388,14 @@ class TestTypePromotion(unittest.TestCase):
assert least_upper_dtype(dtypes.float16, dtypes.int64) == dtypes.float16
assert least_upper_dtype(dtypes.float16, dtypes.uint64) == dtypes.float16
assert least_upper_dtype(dtypes.fp8e4m3, dtypes.fp8e5m2) == dtypes.half
assert least_upper_dtype(dtypes.fp8e4m3, dtypes.bfloat16) == dtypes.bfloat16
assert least_upper_dtype(dtypes.fp8e5m2, dtypes.bfloat16) == dtypes.bfloat16
assert least_upper_dtype(dtypes.fp8e4m3, dtypes.float16) == dtypes.float16
assert least_upper_dtype(dtypes.fp8e5m2, dtypes.float16) == dtypes.float16
assert least_upper_dtype(dtypes.fp8e4m3, dtypes.int64) == dtypes.fp8e4m3
assert least_upper_dtype(dtypes.fp8e4m3, dtypes.uint64) == dtypes.fp8e4m3
assert least_upper_dtype(dtypes.fp8e5m2, dtypes.int64) == dtypes.fp8e5m2
assert least_upper_dtype(dtypes.fp8e5m2, dtypes.uint64) == dtypes.fp8e5m2
class TestAutoCastType(unittest.TestCase):
def setUp(self):
@ -569,10 +578,10 @@ class TestAutoCastType(unittest.TestCase):
def test_gradient_dtype(self):
old_default_float = dtypes.default_float
for default_dtype in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]:
for default_dtype in dtypes.floats:
if not is_dtype_supported(default_dtype): continue
dtypes.default_float = default_dtype
for dtype in [dtypes.float16, dtypes.bfloat16, dtypes.float32, dtypes.float64]:
for dtype in dtypes.floats:
if not is_dtype_supported(dtype): continue
if DEBUG >= 2:
print(f"testing {default_dtype=}, {dtype=}")

View file

@ -67,7 +67,7 @@ class TestGGUF(unittest.TestCase):
return np.array([E] + packed, dtype=np.uint8)
def decode(code, E):
sign = -1.0 if code * 0b1000 else 1.0
sign = -1.0 if (code & 0b1000) else 1.0
exp = (code >> 1) & 0b11
mant = code & 0b1
val = (1.0 + 0.5 * mant) * np.exp2(exp - 1) if exp else 0.5 * mant
@ -83,7 +83,8 @@ class TestGGUF(unittest.TestCase):
expected.extend(decode(c, E) for c in codes)
tensor = Tensor(np.concatenate(blocks))
out = ggml_data_to_tensor(tensor, len(expected), MXFP4)
self.assertListEqual(out.numpy().tolist(), np.array(expected, dtype=np.float32).tolist())
# TODO: should this be exact equal? somehow failed on CI
np.testing.assert_allclose(out.numpy(), expected, atol=0.0, rtol=1e-6)
def test_expected_failure_unknown_type(self):
with self.assertRaises(ValueError):

View file

@ -181,7 +181,7 @@ class TestIndexing(unittest.TestCase):
# self.assertRaises(TypeError, delitem)
# TODO: LLVM is quite fast, why are other compiled backends slow?
@unittest.skipIf(CI and Device.DEFAULT in ["CPU", "GPU", "METAL", "NV", "AMD"], "slow")
@unittest.skipIf(CI and Device.DEFAULT in ["CPU", "CL", "METAL", "NV", "AMD"], "slow")
def test_advancedindex(self):
# integer array indexing

View file

@ -23,7 +23,6 @@ class TestLinearizerRewrite(unittest.TestCase):
si = out.schedule()[-1]
opts_to_apply = []
opts_to_apply.append(Opt(OptOps.UPCAST, 0, 4))
opts_to_apply.append(Opt(OptOps.UNROLL, 0, 4))
ast = si.ast.replace(arg=KernelInfo(opts_to_apply=tuple(opts_to_apply)))
prg = get_program(ast, Device["CPU"].renderer)
print(prg.src)

View file

@ -1,7 +1,7 @@
#!/usr/bin/env python
import unittest
import numpy as np
from tinygrad.dtype import dtypes
from tinygrad.dtype import dtypes, Invalid
from tinygrad.helpers import prod
from tinygrad.shape.shapetracker import ShapeTracker, View
from tinygrad import Variable
@ -10,7 +10,8 @@ from tinygrad.codegen.late.devectorizer import sym
from itertools import product
def shapetracker_getitem(st:ShapeTracker, val:int):
idx, valid = st.reshape((st.size,)).to_indexed_uops([UOp.const(dtypes.int, val)])
valid_idx = st.reshape((st.size,)).to_valid_uop([UOp.const(dtypes.int, val)])
idx, valid = valid_idx.get_idx(), valid_idx.get_valid()
idx, valid = graph_rewrite(idx, sym), graph_rewrite(valid, sym)
assert idx.op is Ops.CONST and valid.op is Ops.CONST
return idx.arg, valid.arg
@ -68,7 +69,7 @@ class CheckingShapeTracker:
def contiguous(self): return self.st.contiguous
def assert_same(self):
x = [(v[0] if (v:=shapetracker_getitem(self.st, i))[1] else -1) for i in range(prod(self.st.shape))]
x = [(v[0] if (v:=shapetracker_getitem(self.st, i))[1] and v[0] is not Invalid else -1) for i in range(prod(self.st.shape))]
y = [self[i] for i in range(prod(self.shape))]
assert self.st.shape == self.shape
assert x == y, f"mismatch shapetracker:{x} real:{y}"
@ -154,7 +155,7 @@ class TestRealStrides(unittest.TestCase):
View.create((1, 3, 22, 21), (0, 192, 16, 1), 0, ((0, 1), (0, 3), (0, 12), (0, 16))),
View.create((3, 11, 7, 2, 3), (462, 21, 1, 231, 7), 0, None),
))
self.assertEqual(st.real_strides(), (132, None, None, None, None))
self.assertEqual(st.real_strides(), (132, 12, None, None, None))
class TestRealSimplifies(unittest.TestCase):
def tearDown(self):
@ -619,20 +620,6 @@ class TestMaskedShapeTracker(unittest.TestCase):
st3.reshape((4, 3, 6, 5))
st3.assert_same()
def test_axis_is_masked(self):
st = ShapeTracker.from_shape((100, 100, 100, 100)).pad(((0,1),(0,0),(2,0), (0,0)))
assert st.axis_is_masked(0)
assert not st.axis_is_masked(1)
assert st.axis_is_masked(2)
assert not st.axis_is_masked(3)
def test_axis_is_masked_rw1(self):
st = ShapeTracker(views=(View(shape=(1, 2, 1, 4, 4, 13, 4, 13), strides=(0, 324, 0, 81, 0, 9, 0, 1), offset=-20,
mask=((0, 1), (0, 2), (0, 1), (0, 4), (0, 4), (2, 11), (0, 4), (2, 11)), contiguous=False),
View(shape=(2, 4, 11, 11, 4, 3, 3), strides=(10816, 0, 52, 1, 2704, 728, 14), offset=0,
mask=None, contiguous=False)))
assert not st.axis_is_masked(0)
class TestShapeTracker(unittest.TestCase):
def setUp(self):
self.st = CheckingShapeTracker((7,4))
@ -830,12 +817,14 @@ class TestShapeTrackerSize(unittest.TestCase):
class TestRender(unittest.TestCase):
def test_render(self):
st = ShapeTracker.from_shape((2, 3))
idx, valid = st.to_indexed_uops()
valid_idx = st.to_valid_uop()
idx, valid = valid_idx.get_idx(), valid_idx.get_valid()
self.assertEqual(idx.render(), "((ridx0*3)+ridx1)")
self.assertEqual(valid.render(), "True")
st = st.pad(((0, 1), (0, 0)))
idx, valid = st.to_indexed_uops()
valid_idx = st.to_valid_uop()
idx, valid = valid_idx.get_idx(), valid_idx.get_valid()
self.assertEqual(idx.render(), "((ridx0*3)+ridx1)")
self.assertEqual(valid.render(), "(ridx0<2)")

View file

@ -8,13 +8,13 @@ from tinygrad.helpers import Context
def get_gated_load_uop(valid:UOp, idx:UOp):
return UOp(Ops.LOAD, dtypes.float, (
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0).index(idx, valid),
UOp(Ops.DEFINE_GLOBAL, dtypes.float.ptr(), arg=0).index(idx.valid(valid)),
UOp.const(dtypes.float, 0.0)
))
def get_load_image_uop(image_shape:tuple[int, ...], valid:UOp, idx:tuple[UOp, UOp]):
return UOp(Ops.LOAD, dtypes.float.vec(4), (
UOp(Ops.DEFINE_GLOBAL, dtypes.imagef(image_shape), arg=0).index(UOp(Ops.VECTORIZE, dtypes.int.vec(2), idx), valid),
UOp(Ops.DEFINE_GLOBAL, dtypes.imagef(image_shape), arg=0).index(UOp(Ops.VECTORIZE, dtypes.index.vec(2), idx).valid(valid)),
UOp(Ops.VECTORIZE, dtypes.float.vec(4), src=(UOp.const(dtypes.float, 0.0),) * 4)
))
@ -359,7 +359,7 @@ class TestImageSimplification(unittest.TestCase):
self.check(load, None, "((gidx*3)+-1438)", "0")
def test_simplify2(self):
# from GPU=1 DEBUG=4 FORWARD_ONLY=1 IMAGE=2 python3 test/test_ops.py TestOps.test_simple_padding_conv2d
# from CL=1 DEBUG=4 FORWARD_ONLY=1 IMAGE=2 python3 test/test_ops.py TestOps.test_simple_padding_conv2d
lidx = Special("lidx", 4)
valid = (lidx<3) & (lidx<1).ne(True)
idx = ((lidx+1)%2, (lidx+1)//2-1)

View file

@ -13,7 +13,6 @@ class TestSymbolic(unittest.TestCase):
assert st.shape == (x, 3)
assert st.real_strides() == (3, 1)
@unittest.expectedFailure
def test_real_strides_0(self):
st = ShapeTracker(views=(View(shape=(2, (Variable('start_pos', 1, 8)+1), 1, 1), strides=(8, 1, 0, 0), offset=0, mask=((0, 2), (0, Variable('start_pos', 1, 8)), (0, 1), (0, 1)), contiguous=False), View(shape=(2, (Variable('start_pos', 1, 8)+1)), strides=((Variable('start_pos', 1, 8)+1), 1), offset=0, mask=None, contiguous=True))) # noqa: E501
self.assertEqual(st.real_strides(), (8, None))
@ -198,7 +197,7 @@ class TestSymbolicPad(unittest.TestCase):
def test_pad(self):
v = Variable("v", 1, 100).bind(5)
t = Tensor.ones(100)[:v].pad(((4, 0),))
t = t.reshape(9)
t = t[:9]
assert t.tolist() == [0,0,0,0,1,1,1,1,1]

View file

@ -34,7 +34,7 @@ class TestTensorMutates(unittest.TestCase):
is_pattern_uop(c.uop.base, realized_pattern)
# NOTE: we keep movement ops on top of the buffer view
is_pattern_uop(c.uop, UPat(Ops.BUFFER))
is_pattern_uop(d.uop, UPat(Ops.VIEW, src=(realized_pattern,)))
assert d.uop is not d.uop.base
def test_reshape_is_same_child(self):
a = Tensor([1,2,3])
@ -58,40 +58,6 @@ class TestTensorUopRepresentation(unittest.TestCase):
print(c.uop)
is_pattern(c, UPat(Ops.ADD, src=(realized_pattern, realized_pattern)))
def test_const_pattern(self):
a = Tensor(1)
print(a.uop)
is_pattern(a, const_pattern) # const in tensor has a DEVICE and VIEW src
is_pattern(a, UPat.cvar("x")) # even cvar works!
def test_consts_do_not_realize(self):
a = Tensor(1)
print(a.uop)
pre_realize = a.uop
a.realize()
assert a.uop is pre_realize
def test_viewed_consts_do_not_realize(self):
a = Tensor.ones(10, 10)
print(a.uop)
a.realize()
is_pattern(a, const_pattern)
self.assertEqual(a.uop.shape, (10, 10))
# CONST is EXPAND -> RESHAPE -> CONST -> DEVICE
def test_consts_dont_have_buffers(self):
a = Tensor.ones(10, 10)
buffers_in_parents = [x.op for x in a.uop.toposort() if x.op is Ops.BUFFER]
self.assertEqual(len(buffers_in_parents), 0)
is_pattern(a, UPat(Ops.EXPAND, src=(UPat(Ops.RESHAPE, src=(const_pattern,)),)))
# COPY has a copyin source and a device.
def test_copyin(self):
a = Tensor([1.,2,3]).realize()
c = a.to("TEST") # NOTE: this isn't checked
print(c.uop)
is_pattern(c, UPat(Ops.COPY, src=(realized_pattern, UPat(Ops.DEVICE)), arg=None))
def test_empty_buf(self):
a = Tensor.empty(3, 3)
is_pattern(a, UPat(Ops.RESHAPE, src=(UPat(Ops.BUFFER),)))

View file

@ -2,7 +2,7 @@
import unittest, pickle, functools, math
import z3
from tinygrad.dtype import dtypes, ConstType, DType
from tinygrad.dtype import dtypes, ConstType, DType, Invalid
from tinygrad.codegen import full_rewrite
from tinygrad.helpers import Context
from tinygrad.uop.ops import UOp, Ops, graph_rewrite, sym_infer, track_rewrites
@ -93,6 +93,37 @@ class TestSymbolic(unittest.TestCase):
assert idx1+idx2 is not idx2
assert idx1*idx2 is not idx2*idx1
def test_uop_gcd_method(self):
a = Variable("a", 0, 8)
b = Variable("b", 0, 8)
self.assertEqual(UOp.gcd(a, a*b, a*3).simplify(), a)
self.assertEqual(UOp.gcd(a*a*a, a*b*a, a*3*a).simplify(), a*a)
self.assertEqual(UOp.gcd(a*a*10, b*a*5, a*a*5).simplify(), a*5)
self.assertEqual(UOp.gcd(a*10, b*5, a*5).simplify(), a.const_like(5))
self.assertEqual(UOp.gcd(a, b*5, a*5).simplify(), a.const_like(1))
def test_divides_exact(self):
a = Variable("a", 1, 8)
b = Variable("b", 1, 8)
self.assertEqual((a*a*3).divide_exact(a).simplify(), a*3)
self.assertEqual((a*a*3).divide_exact(a*a*3).simplify(), a.const_like(1))
self.assertEqual((a*b*3).divide_exact(a.const_like(3)).simplify(), a*b)
self.assertEqual((a*a*3).divide_exact(a*a.const_like(-3)).simplify(), a*-1)
self.assertEqual((a*a*b*3).divide_exact(a*b).simplify(), a*3)
self.assertEqual((a*3+a*b).divide_exact(a).simplify(), b+3)
self.assertEqual((a*b*3+a*b*b).divide_exact(a*b).simplify(), b+3)
self.assertEqual((((a*-2)+14)*b).divide_exact(((a*-2)+14)).simplify(), b)
def test_divide_exact_not(self):
a = Variable("a", 1, 8)
b = Variable("b", 1, 8)
x = Variable("x", -20, 0)
self.assertEqual((a).divide_exact(b), None)
self.assertEqual((a+2).divide_exact(a), None)
self.assertEqual((x*-1).divide_exact(a), None)
self.assertEqual((a*5).divide_exact(a*10), None)
self.assertEqual((a*10-1).divide_exact(a*10), None)
def test_factorize(self):
a = Variable("a", 0, 8)
b = Variable("b", 0, 8)
@ -110,7 +141,7 @@ class TestSymbolic(unittest.TestCase):
self.helper_test_variable(-Variable("a", 0, 8), -8, 0, "(a*-1)")
def test_xor_0(self):
self.helper_test_variable(Variable("a", 0, 8, dtypes.int) ^ 0, 0, 8, "a")
self.helper_test_variable(Variable("a", 0, 8, dtypes.int) ^ 0, 0, 8, "a", test_z3=False)
def test_add_1(self):
self.helper_test_variable(Variable("a", 0, 8)+1, 1, 9, "(a+1)")
@ -374,6 +405,16 @@ class TestSymbolic(unittest.TestCase):
def test_and_remove(self):
self.helper_test_variable(uand([uconst(1), Variable("a", 0, 1)]), 0, 1, "a")
def test_bool_or_not_tautology(self):
a = Variable("a", 0, 10)
c = a<10
self.helper_test_variable(c | c.logical_not(), True, True, "True")
def test_bool_and_not_contradiction(self):
a = Variable("a", 0, 10)
c = a<10
self.helper_test_variable(c & c.logical_not(), False, False, "False")
def test_mod_factor_negative(self):
self.helper_test_variable(usum([uconst(-29), Variable("a", 0, 10), Variable("b", 0, 10)*28]) % 28, -27, 27, "(((a+(b*28))+-29)%28)")
self.helper_test_variable(usum([uconst(-29), Variable("a", 0, 100), Variable("b", 0, 10)*28]) % 28, -27, 27, "(((a+(b*28))+-29)%28)")
@ -440,6 +481,33 @@ class TestSymbolic(unittest.TestCase):
def test_mul_div_factor_div_neg(self):
self.helper_test_variable((Variable("a", 0, 10)*-4+4)//8, -4, 0, "(((a*-1)+1)//2)")
def test_div_symbolic_const_gcd(self):
a = Variable("a", -10, 10)
b = Variable("b", -10, 10)
d = Variable("d", 1, 10)
self.helper_test_variable((3*a+9*b)//(3*d), -40, 40, "((a+(b*3))//d)")
def test_symbolic_gcd_div(self):
a = Variable("a", -10, 10)
b = Variable("b", -10, 10)
c = Variable("c", -10, 10)
d1 = Variable("d1", 1, 10)
d2 = Variable("d2", -10, -1)
self.helper_test_variable((d1*a*b*d1)//(d1), -1000, 1000, "(a*(b*d1))")
self.helper_test_variable((d1*a*d2*b*d1)//(d1*d2), -1000, 1000, "(a*(b*d1))")
self.helper_test_variable((d1*a + b*d1)//(d1), -20, 20, "(a+b)")
self.helper_test_variable((d1*a + b*d1 + c*d1)//(d1), -30, 30, "(c+(a+b))")
self.helper_test_variable((3*a*d1 + 9*b*d1)//(3*d1*d2), -40, 40, "(((a+(b*3))//(d2*-1))*-1)")
self.helper_test_variable((3*a*d1 + 9*b*d1+3)//(3*d1*d2), -401, 399, "(((((a*d1)+((b*d1)*3))+1)//((d1*d2)*-1))*-1)")
def test_symbolic_factor_remainder_div(self):
a = Variable("a", 0, 10)
b = Variable("b", 0, 10)
d = Variable("d", 1, 10)
self.helper_test_variable((d*a+b)//d, 0, 20, "(a+(b//d))")
self.helper_test_variable((d*a*20+b)//(5*d), 0, 42, "((a*4)+(b//(d*5)))")
self.helper_test_variable((d*a*20+b*d*5+10)//(5*d), 0, 52, "((b+(a*4))+(2//d))")
def test_mod_gcd_factor_neg(self):
self.helper_test_variable((Variable("a", 0, 10)*-4+4)%8, -4, 4, "((((a*-1)+1)%2)*4)")
@ -510,6 +578,13 @@ class TestSymbolic(unittest.TestCase):
self.helper_test_variable((gidx0*4+lidx2*2+lidx3)//12, 0, 4, ("(((lidx2//2)+gidx0)//3)", "((gidx0+(lidx2//2))//3)"))
self.helper_test_variable((lidx2*2+gidx0*4+lidx3)//12, 0, 4, ("(((lidx2//2)+gidx0)//3)", "((gidx0+(lidx2//2))//3)"))
@unittest.expectedFailure # TODO: improve nest_div_by_smallest_factor
def test_sum_div_complex4(self):
gidx0 = Variable("gidx0", 0, 2)
lidx2 = Variable("lidx2", 0, 12)
lidx3 = Variable("lidx3", 0, 12)
self.helper_test_variable((gidx0*3+lidx2*19+lidx3*38)//(3*19), 0, 12, ("((lidx2+(lidx3*2))//3)"))
def test_sum_mul_distribute(self):
gidx0 = Variable("gidx0", 0, 7)
lidx2 = Variable("lidx2", 0, 12)
@ -581,45 +656,6 @@ class TestSymbolic(unittest.TestCase):
with self.assertRaises(AssertionError):
self.helper_test_variable((30 * b + 1) % 18 + ((30 * b + 1) // 18) * 18, 1, 3001, "((b*30)+1)")
def test_arange_unrolled4(self):
gidx = Variable("gidx", 0, 2559)
unrolled_div = (gidx+2561)//4+(gidx+2562)//4+(gidx+2560)//4+(gidx+2559)//4
self.helper_test_variable(unrolled_div, 2559, 5118, "(gidx+2559)")
def test_arange_unrolled4_with_cast(self):
gidx = Variable("gidx", 0, 2559, dtypes.index)
dt = dtypes.int
unrolled_div = ((gidx+2561)//4 + 2).cast(dt)+((gidx+2562)//4).cast(dt)+((gidx+2560)//4).cast(dt)+((gidx+2559)//4).cast(dt)
self.helper_test_variable(unrolled_div, 2561, 5120, "((int)(gidx)+2561)")
def test_arange_unrolled4_mul(self):
gidx = Variable("gidx", 0, 2559)
unrolled_div = 2*((gidx+2561)//4)+2*((gidx+2562)//4)+2*((gidx+2560)//4)+2*((gidx+2559)//4)
self.helper_test_variable(unrolled_div, 5118, 10236, "((gidx*2)+5118)")
def test_arange_unrolled4_small(self):
gidx = Variable("gidx", 0, 3)
unrolled_div = (gidx)//4+(gidx+2)//4+(gidx+3)//4+(gidx+1)//4
self.helper_test_variable(unrolled_div, 0, 3, "gidx")
gidx = Variable("gidx", 0, 2)
unrolled_div = (gidx)//4+(gidx+2)//4+(gidx+3)//4+(gidx+1)//4
self.helper_test_variable(unrolled_div, 0, 2, "gidx")
gidx = Variable("gidx", 0, 1)
unrolled_div = (gidx)//4+(gidx+2)//4+(gidx+3)//4+(gidx+1)//4
self.helper_test_variable(unrolled_div, 0, 1, "gidx")
def test_arange_unrolled2(self):
gidx = Variable("gidx", 0, 2559)
unrolled_div = (gidx+2559)//2+(gidx+2560)//2+3
self.helper_test_variable(unrolled_div, 2562, 5121, "(gidx+2562)")
def test_arange_unrolled2_neg(self):
ridx = Variable("ridx", 0, 255)
unrolled_div = -((255-ridx)//2) - ((256-ridx)//2)
self.helper_test_variable(unrolled_div, -255, 0, "(ridx+-255)")
def test_gated_load(self):
idx = Variable("idx", 0, 24)
self.helper_test_variable(idx//4, 0, 6, "(idx//4)")
@ -924,6 +960,46 @@ class TestSymbolicSymbolicOps(unittest.TestCase):
assert c == uconst(2)
"""
class TestInvalidIndex(unittest.TestCase):
def test_invalid_times_0(self):
ridx = Variable("ridx", 0, 10)
idx = (ridx<5).where(ridx, UOp.invalid())*0
self.assertIs(idx.simplify(), (ridx<5).where(0, UOp.invalid()), "multiplying an index by 0 should preserve the invalid")
def test_invalid_comparison_drops_invalid(self):
# comparisons return a bool, and bools can't be invalid
ridx = Variable("ridx", 0, 10)
idx = (ridx<5).where(ridx, UOp.invalid())<3
self.assertIs(idx.simplify(), (ridx<3), "comparison of index should drop the invalid")
self.assertIs(idx.where(UOp.const(dtypes.int, 1), 0).simplify(), (ridx<3).where(UOp.const(dtypes.int, 1), 0),
"comparison of index should drop the invalid")
def test_alu_moves_inside_invalid(self):
ridx = Variable("ridx", 0, 10)
idx = (ridx<5).where(ridx, UOp.invalid())*10
self.assertIs(idx.simplify(), (ridx<5).where(ridx*10, UOp.invalid()), "multiplying an index by 0 should preserve the invalid")
def test_merge_invalid_conditions(self):
ridx0 = Variable("ridx0", 0, 10)
ridx1 = Variable("ridx1", 0, 10)
idx0 = (ridx0<5).where(ridx0, UOp.invalid())
idx1 = (ridx1<5).where(idx0//2, UOp.invalid())
self.assertIs(idx1.simplify(), ((ridx1<5)&(ridx0<5)).where(ridx0//2, UOp.invalid()),
"valid inside a valid should make a single valid and & the conditions")
def test_alu_invalid(self):
self.assertIs((UOp.invalid()*2).simplify(), UOp.invalid())
self.assertIs((UOp.invalid()*0).simplify(), UOp.invalid())
self.assertIs((UOp.invalid()+8).simplify(), UOp.invalid())
self.assertIs((UOp.invalid()+Variable("a",0,10)).simplify(), UOp.invalid())
self.assertIs((UOp.invalid()*Variable("a",0,10)).simplify(), UOp.invalid())
self.assertIs((UOp.invalid()<Variable("a",0,10)).simplify().dtype, dtypes.bool)
def test_alu_invalid_vconst(self):
c1 = UOp.const(dtypes.index.vec(4), (1, 1, Invalid, Invalid))
c2 = UOp.const(dtypes.index.vec(4), (1, Invalid, 1, 1))
self.assertIs((c1+c2).simplify(), UOp.const(dtypes.index.vec(4), (2, Invalid, Invalid, Invalid)))
class TestSymbolicRealWorld(unittest.TestCase):
def test_resnet_half(self):
gidx0 = Variable("gidx0", 0, 3)

View file

@ -1,6 +1,6 @@
import unittest, math
from tinygrad.uop.ops import UOp, Ops
from tinygrad.dtype import dtypes
from tinygrad.dtype import dtypes, Invalid
class TestVminVmaxProperties(unittest.TestCase):
def test_vmin_vmax_constant(self):
@ -122,6 +122,15 @@ class TestVminVmaxProperties(unittest.TestCase):
self.assertEqual(x_uint.vmin, dtypes.min(dtypes.uint))
self.assertEqual(x_uint.vmax, dtypes.max(dtypes.uint))
def test_vmin_vmax_invalid(self):
i = UOp.invalid()
self.assertNotEqual(i.vmin, i.vmax)
def test_vmin_vmax_invalid_vconst(self):
x = UOp.const(dtypes.index.vec(4), (0, 4, Invalid, Invalid))
self.assertLess(x.vmin, 0)
self.assertGreater(x.vmax, 4)
class TestVminVmaxDivMod(unittest.TestCase):
def test_vmin_vmax_division_positive(self):
# vmin and vmax for division of a variable by a positive constant

View file

@ -408,7 +408,7 @@ class TestVizProfiler(unittest.TestCase):
get_profile(prof)
def test_python_marker(self):
with Context(PROFILE=1):
with Context(VIZ=1):
a = Tensor.empty(1, device="NULL")
b = Tensor.empty(1, device="NULL")
(a+b).realize()

View file

@ -1,7 +1,7 @@
import unittest, sys
import numpy as np
from tinygrad import Tensor, GlobalCounters, dtypes, Context, nn
from tinygrad.helpers import CI, Profiling, WINO
from tinygrad.helpers import CI, Profiling, WINO, RANGEIFY
@unittest.skipIf(sys.platform.startswith("win"), "flaky on Windows")
class TestWinogradClose(unittest.TestCase):
@ -35,32 +35,35 @@ class TestWinograd(unittest.TestCase):
def test_forward_kernels(self):
x,w = Tensor.rand(1,4,9,9).realize(), Tensor.rand(4,4,3,3).realize()
out = Tensor.conv2d(x,w)
self.assertEqual(len(out.schedule()), 4)
self.assertEqual(len(out.schedule()), 2 if RANGEIFY else 4)
def test_backward_kernels(self):
x,w = Tensor.empty(1,4,9,9,requires_grad=True).realize(), Tensor.empty(4,4,3,3,requires_grad=True).realize()
out = Tensor.conv2d(x,w, padding=1)
out.mean().backward()
backward_schedule = Tensor.schedule(x.grad, w.grad)
self.assertEqual(len(backward_schedule), 9)
self.assertEqual(len(backward_schedule), 6 if RANGEIFY else 9)
def test_counters(self):
IC, OC, X, Y = 4,4,9,9
#OC, IC, X, Y = 512, 256, 8, 8
x,w = Tensor.rand(1,IC,Y,X).realize(), Tensor.rand(OC,IC,3,3).realize()
GlobalCounters.reset()
Tensor.conv2d(x,w).realize()
with Context(WINO=1):
Tensor.conv2d(x,w).realize()
ops_wino, mem_wino = GlobalCounters.global_ops, GlobalCounters.global_mem
WINO.value = 0
GlobalCounters.reset()
Tensor.conv2d(x,w).realize()
with Context(WINO=0):
Tensor.conv2d(x,w).realize()
ops_normal, mem_normal = GlobalCounters.global_ops, GlobalCounters.global_mem
ops_ratio, mem_ratio = ops_wino/ops_normal, mem_wino/mem_normal
print(f"ops: normal {ops_normal:9d} wino {ops_wino:9d} ratio {ops_ratio:.2f}")
print(f"mem: normal {mem_normal:9d} wino {mem_wino:9d} ratio {mem_ratio:.2f}")
self.assertLess(ops_ratio, 2.6) # TODO: there's issues with factorization now
self.assertLess(mem_ratio, 10)
if not RANGEIFY:
self.assertLess(ops_ratio, 2.6) # TODO: there's issues with factorization now
self.assertLess(mem_ratio, 10)
def test_dtype(self):
IC, OC, X, Y = 4,4,9,9

View file

@ -10,7 +10,7 @@ from tinygrad.renderer import Renderer
from tinygrad.codegen.lowerer import pm_lowerer, get_index
from tinygrad.codegen.quantize import pm_quant
from tinygrad.codegen.gpudims import pm_add_gpudims
from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing, cast_folding
from tinygrad.uop.symbolic import sym, symbolic_simple, gep_pushing
from tinygrad.uop.decompositions import get_late_rewrite_patterns
from tinygrad.codegen.late.expander import migrate_indexing, expander, pm_pre_expander
from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_indexing, devectorize, pm_reduce, \
@ -18,8 +18,8 @@ from tinygrad.codegen.late.devectorizer import load_store_folding, load_store_in
from tinygrad.codegen.late.linearize import block_create, pm_blockend_merge, block_merge, pm_finalize, BlockContext
from tinygrad.codegen.opt.swizzler import view_left, view_right, fix_kernel_ops
from tinygrad.codegen.opt.postrange import pm_postrange_opt
from tinygrad.codegen.simplify import pm_simplify_ranges
from tinygrad.schedule.rangeify import pm_add_buffers_local, rangeify_codegen
from tinygrad.codegen.simplify import pm_simplify_ranges, pm_reduce_simplify, pm_flatten_range
from tinygrad.schedule.rangeify import pm_add_buffers, rangeify_codegen
@dataclass
class RewriteStep:
@ -62,10 +62,11 @@ def _get_rewrites_for_renderer(opts:Renderer, optimize:bool, linearizer:bool, _Q
ret.append(RewriteStep(pm_lowerer, get_index, name="lowerer", bottom_up=True))
# symbolic (NOTE: this is a requirement for pm_simplify_ranges to be correct)
ret.append(RewriteStep(sym, name="initial symbolic"))
ret.append(RewriteStep(sym+pm_flatten_range, name="initial symbolic"))
# optimize (schedule) the AST
ret.append(RewriteStep(pm_simplify_ranges, name="simplify ranges"))
ret.append(RewriteStep(pm_reduce_simplify, name="simplify reduces"))
ret.append(RewriteStep(pm_postrange_opt, ctx=lambda _: opts, name="post optimize ast"))
# ** expander (expand_rewrite) **
@ -75,7 +76,7 @@ def _get_rewrites_for_renderer(opts:Renderer, optimize:bool, linearizer:bool, _Q
ret.append(RewriteStep(sym+pm_pre_expander+expander, name="expander"))
# add locals
ret.append(RewriteStep(pm_add_buffers_local+rangeify_codegen, name="add local buffers"))
ret.append(RewriteStep(pm_add_buffers+rangeify_codegen, name="add local buffers"))
# ** devectorizer (full_graph_rewrite) **
# remove reduce
@ -94,7 +95,7 @@ def _get_rewrites_for_renderer(opts:Renderer, optimize:bool, linearizer:bool, _Q
extra_matcher = opts.extra_matcher if opts.extra_matcher is not None else PatternMatcher([])
# lower the index dtype to a concrete int
ret.append(RewriteStep(pm_lower_index_dtype+cast_folding+load_store_indexing, lambda _: opts.device, name="lower all index dtypes"))
ret.append(RewriteStep(load_store_indexing+pm_lower_index_dtype, lambda _: opts.device, name="lower all index dtypes"))
# optional pre matcher
if opts.pre_matcher is not None: ret.append(RewriteStep(opts.pre_matcher, name="pre_matcher"))

Some files were not shown because too many files have changed in this diff Show more