Merge branch 'master' into dsp_search_merged

2026-06-24 02:14:17 +00:00 · 2025-04-02 15:29:42 +08:00 · 2025-04-02 15:29:42 +08:00 · efad1ebd0d
commit efad1ebd0d
parent e20eed6208 4514fd91c1
4 changed files with 17 additions and 12 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -401,8 +401,9 @@ jobs:
    #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama.py --gen 1 --size 7B --shard 6 --prompt "Hello." --count 10 --temperature 0  --timing | tee llama_six_gpu.txt
    - name: Run LLaMA-3 8B BEAM
      run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 python3 examples/llama3.py --size 8B --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_beam.txt
-    - name: Run LLaMA-3 8B on 4 GPUs with BEAM
-      run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
+    # TODO: device hangs
+    # - name: Run LLaMA-3 8B on 4 GPUs with BEAM
+    #   run: AMD=1 JITBEAM=2 IGNORE_BEAM_CACHE=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 4 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_four_gpu.txt
    # - name: Run LLaMA-3 8B on 6 GPUs
    #   run: AMD=1 CAPTURE_PROCESS_REPLAY=0 python3 examples/llama3.py --size 8B --shard 6 --model weights/LLaMA-3/8B-SF-DPO/ --benchmark --temperature 0 | tee llama3_six_gpu.txt
    - name: Restore amdgpu
--- a/docs/env_vars.md
+++ b/docs/env_vars.md
@ -30,7 +30,7 @@ These control the behavior of core tinygrad even when used as a library.

 Variable | Possible Value(s) | Description
 ---|---|---
-DEBUG               | [1-6]      | enable debugging output, with 4 you get operations, timings, speed, generated code and more
+DEBUG               | [1-7]      | enable debugging output (operations, timings, speed, generated code and more)
 GPU                 | [1]        | enable the GPU (OpenCL) backend
 CUDA                | [1]        | enable CUDA backend
 AMD                 | [1]        | enable AMD backend
@ -49,4 +49,16 @@ VISIBLE_DEVICES     | [list[int]]| restricts the NV/AMD devices that are availab
 JIT                 | [0-2]      | 0=disabled, 1=[jit enabled](quickstart.md#jit) (default), 2=jit enabled, but graphs are disabled
 VIZ                 | [1]        | 0=disabled, 1=[viz enabled](https://github.com/tinygrad/tinygrad/tree/master/tinygrad/viz)
 ALLOW_TF32          | [1]        | enable TensorFloat-32 tensor cores on Ampere or newer GPUs.
-WEBGPU_BACKEND      | [WGPUBackendType_Metal, ...]          | Force select a backend for WebGPU (Metal, DirectX, OpenGL, Vulkan...)
+WEBGPU_BACKEND      | [WGPUBackendType_Metal, ...]          | Force select a backend for WebGPU (Metal, DirectX, OpenGL, Vulkan...)
+
+## Debug breakdown
+
+Variable | Value | Description
+---|---|---
+DEBUG               | >= 1       | Enables debugging and lists devices being used
+DEBUG               | >= 2       | Provides performance metrics for operations, including timing, memory usage, and bandwidth for each kernel execution
+DEBUG               | >= 3       | Outputs buffers used for each kernel (shape, dtype and strides) and the applied optimizations at a kernel level
+DEBUG               | >= 4       | Outputs the generated kernel code
+DEBUG               | >= 5       | Displays the intermediate representation of the computation UOps (AST)
+DEBUG               | >= 6       | Displays the intermediate representation of the computation UOps in a linearized manner, detailing the operation sequence
+DEBUG               | >= 7       | Outputs the assembly code generated for the target hardware
--- a/examples/benchmark_onnx.py
+++ b/examples/benchmark_onnx.py
@ -34,8 +34,3 @@ if __name__ == "__main__":
  if getenv("ORT"):
    validate(onnx_file, new_inputs, rtol=1e-3, atol=1e-3)
    print("model validated")
-
-  if (fn:=getenv("SAVE_PKL", "")) != "":
-    with open(fn, "wb") as f:
-      pickle.dump(run_onnx_jit, f)
-    print(f"pkl saved to {fn}")
--- a/tinygrad/codegen/symbolic.py
+++ b/tinygrad/codegen/symbolic.py
@ -198,9 +198,6 @@ commutative = PatternMatcher([
 ])

 symbolic = symbolic_simple+PatternMatcher([
-  # ** COMMUTATIVE flipping (only for ints) **
-  # NOTE: this can break merging vector math by only flipping some of them
-  #(UPat(GroupOp.Commutative, dtype=dtypes.int, name='x'), lambda x: x.replace(src=x.src[::-1]) if x.src[1].tuplize < x.src[0].tuplize else None),
  # ** boolean algebra **
  (UPat.var("x") | (UPat.var("x") & UPat.var()), lambda x: x), # x|(x&y) -> x
  # ** combine terms **