mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
faster bert global_norm (#9901)
tinyamd 2% faster. also updated beam params that's 2-3% faster. update mlperf doc and steps too
This commit is contained in:
parent
91ccf1c343
commit
e8024c8281
13 changed files with 13 additions and 36 deletions
|
|
@ -820,12 +820,13 @@ def train_step_bert(model, optimizer, scheduler, loss_scaler:float, input_ids:Te
|
|||
loss = model.loss(lm_logits, seq_relationship_logits, masked_lm_ids, masked_lm_weights, next_sentence_labels)
|
||||
(loss * loss_scaler).backward()
|
||||
|
||||
global_norm = Tensor([0.0], dtype=dtypes.float32, device=optimizer[0].device).realize()
|
||||
global_norm = Tensor([0.0], dtype=dtypes.float32, device=optimizer[0].device)
|
||||
for p in optimizer.params:
|
||||
p.grad = p.grad / loss_scaler
|
||||
global_norm += p.grad.float().square().sum()
|
||||
global_norm = global_norm.sqrt()
|
||||
for p in optimizer.params: p.grad = (p.grad / Tensor.where(global_norm > 1.0, global_norm, 1.0)).cast(p.grad.dtype)
|
||||
global_norm = global_norm.sqrt().contiguous()
|
||||
for p in optimizer.params:
|
||||
p.grad = (global_norm > 1.0).where((p.grad/global_norm).cast(p.grad.dtype), p.grad)
|
||||
|
||||
optimizer.step()
|
||||
scheduler.step()
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ export MODEL="bert"
|
|||
export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
|
||||
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10
|
||||
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
|
||||
export BASEDIR="/raid/datasets/wiki"
|
||||
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
|
|||
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
|
||||
export TRAIN_STEPS=3900
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10
|
||||
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
|
||||
export BASEDIR="/raid/datasets/wiki"
|
||||
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ export DEFAULT_FLOAT="HALF" GPUS=8 BS=1024 EVAL_BS=1024
|
|||
export OPT_BASE_LEARNING_RATE=0.0011 OPT_LAMB_BETA_1=0.60466 OPT_LAMB_BETA_2=0.85437 DECAY=0.1
|
||||
export TRAIN_STEPS=3900
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10
|
||||
export BEAM=3 BEAM_UOPS_MAX=6000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1 FREE_INTERMEDIATE=0
|
||||
export BASEDIR="/raid/datasets/wiki"
|
||||
|
||||
|
|
|
|||
|
|
@ -61,12 +61,6 @@ examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementation
|
|||
|
||||
### tinybox_red
|
||||
|
||||
#### One time setup
|
||||
|
||||
```
|
||||
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh
|
||||
```
|
||||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ export PYTHONPATH="."
|
|||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export BEAM=7 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
export BEAM_LOG_SURPASS_MAX=1
|
||||
export BASEDIR="/raid/datasets/wiki"
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ export PYTHONPATH="."
|
|||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export BEAM=7 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
export BASEDIR="/raid/datasets/wiki"
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ export MODEL="bert"
|
|||
export SUBMISSION_PLATFORM="tinybox_green"
|
||||
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export BEAM=7 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
export BASEDIR="/raid/datasets/wiki"
|
||||
|
||||
|
|
|
|||
|
|
@ -18,10 +18,6 @@ pip install tqdm tensorflow
|
|||
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
|
||||
This is the default on production tinybox green.
|
||||
|
||||
### tinybox_red
|
||||
Disable cwsr + increase mes timeout.
|
||||
Install the custom amdgpu driver per [README](https://github.com/nimlgen/amdgpu_ubuntu_22_04/blob/v6.1.3/readme.md)
|
||||
|
||||
# 2. Directions
|
||||
|
||||
## Steps to download and verify data
|
||||
|
|
@ -61,12 +57,6 @@ examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementation
|
|||
|
||||
### tinybox_red
|
||||
|
||||
#### One time setup
|
||||
|
||||
```
|
||||
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/setup.sh
|
||||
```
|
||||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ export PYTHONPATH="."
|
|||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export BEAM=3 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
export BEAM_LOG_SURPASS_MAX=1
|
||||
export BASEDIR="/raid/datasets/wiki"
|
||||
|
|
|
|||
|
|
@ -4,7 +4,7 @@ export PYTHONPATH="."
|
|||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export BEAM=3 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
export BASEDIR="/raid/datasets/wiki"
|
||||
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ export MODEL="bert"
|
|||
export SUBMISSION_PLATFORM="tinybox_red"
|
||||
export DEFAULT_FLOAT="HALF" SUM_DTYPE="HALF" GPUS=6 BS=96 EVAL_BS=96
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export BEAM=3 BEAM_UOPS_MAX=10000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
export BASEDIR="/raid/datasets/wiki"
|
||||
|
||||
|
|
|
|||
|
|
@ -1,8 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
rocm-smi --setprofile compute
|
||||
rocm-smi --setmclk 3
|
||||
rocm-smi --setperflevel high
|
||||
|
||||
# power cap to 350W
|
||||
# echo "350000000" | sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap
|
||||
Loading…
Add table
Add a link
Reference in a new issue