mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
Merge branch 'master' into dsp_search_merged
This commit is contained in:
commit
efad1ebd0d
4 changed files with 17 additions and 12 deletions
5
.github/workflows/benchmark.yml
vendored
5
.github/workflows/benchmark.yml
vendored
|
|
@ -401,8 +401,9 @@ jobs:
|
|||
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0 --timing | tee llama_six_gpu.txt
|
||||
- name: Run LLaMA-3 8B BEAM
|
||||
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
|
||||
- name: Run LLaMA-3 8B on 4 GPUs with BEAM
|
||||
run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
|
||||
# TODO: device hangs
|
||||
# - name: Run LLaMA-3 8B on 4 GPUs with BEAM
|
||||
# run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
|
||||
# - name: Run LLaMA-3 8B on 6 GPUs
|
||||
# run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
|
||||
- name: Restore amdgpu
|
||||
|
|
|
|||
|
|
@ -30,7 +30,7 @@ These control the behavior of core tinygrad even when used as a library.
|
|||
|
||||
Variable | Possible Value(s) | Description
|
||||
---|---|---
|
||||
DEBUG | [1-6] | enable debugging output, with 4 you get operations, timings, speed, generated code and more
|
||||
DEBUG | [1-7] | enable debugging output (operations, timings, speed, generated code and more)
|
||||
GPU | [1] | enable the GPU (OpenCL) backend
|
||||
CUDA | [1] | enable CUDA backend
|
||||
AMD | [1] | enable AMD backend
|
||||
|
|
@ -49,4 +49,16 @@ VISIBLE_DEVICES | [list[int]]| restricts the NV/AMD devices that are availab
|
|||
JIT | [0-2] | 0=disabled, 1=[jit enabled](quickstart.md#jit) (default), 2=jit enabled, but graphs are disabled
|
||||
VIZ | [1] | 0=disabled, 1=[viz enabled](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/viz)
|
||||
ALLOW_TF32 | [1] | enable TensorFloat-32 tensor cores on Ampere or newer GPUs.
|
||||
WEBGPU_BACKEND | [WGPUBackendType_Metal, ...] | Force select a backend for WebGPU (Metal, DirectX, OpenGL, Vulkan...)
|
||||
WEBGPU_BACKEND | [WGPUBackendType_Metal, ...] | Force select a backend for WebGPU (Metal, DirectX, OpenGL, Vulkan...)
|
||||
|
||||
## Debug breakdown
|
||||
|
||||
Variable | Value | Description
|
||||
---|---|---
|
||||
DEBUG | >= 1 | Enables debugging and lists devices being used
|
||||
DEBUG | >= 2 | Provides performance metrics for operations, including timing, memory usage, bandwidth for each kernel execution
|
||||
DEBUG | >= 3 | Outputs buffers used for each kernel (shape, dtype and strides) and the applied optimizations at a kernel level
|
||||
DEBUG | >= 4 | Outputs the generated kernel code
|
||||
DEBUG | >= 5 | Displays the intermediate representation of the computation UOps (AST)
|
||||
DEBUG | >= 6 | Displays the intermediate representation of the computation UOps in a linearized manner, detailing the operation sequence
|
||||
DEBUG | >= 7 | Outputs the assembly code generated for the target hardware
|
||||
|
|
|
|||
|
|
@ -34,8 +34,3 @@ if __name__ == "__main__":
|
|||
if getenv("ORT"):
|
||||
validate(onnx_file, new_inputs, rtol=1e-3, atol=1e-3)
|
||||
print("model validated")
|
||||
|
||||
if (fn:=getenv("SAVE_PKL", "")) != "":
|
||||
with open(fn, "wb") as f:
|
||||
pickle.dump(run_onnx_jit, f)
|
||||
print(f"pkl saved to {fn}")
|
||||
|
|
|
|||
|
|
@ -198,9 +198,6 @@ commutative = PatternMatcher([
|
|||
])
|
||||
|
||||
symbolic = symbolic_simple+PatternMatcher([
|
||||
# ** COMMUTATIVE flipping (only for ints) **
|
||||
# NOTE: this can break merging vector math by only flipping some of them
|
||||
#(UPat(GroupOp.Commutative, dtype=dtypes.int, name='x'), lambda x: x.replace(src=x.src[::-1]) if x.src[1].tuplize < x.src[0].tuplize else None),
|
||||
# ** boolean algebra **
|
||||
(UPat.var("x") | (UPat.var("x") & UPat.var()), lambda x: x), # x|(x&y) -> x
|
||||
# ** combine terms **
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue