mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
update for 5.0
This commit is contained in:
parent
edd4fa3b36
commit
12c6573cf6
16 changed files with 92 additions and 265 deletions
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="." AMD=1
|
||||
export MODEL="bert"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128
|
||||
|
||||
export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
|
||||
export IGNORE_JIT_FIRST_BEAM=1
|
||||
# export BEAM_LOG_SURPASS_MAX=1
|
||||
# export BASEDIR="/raid/datasets/wiki"
|
||||
|
||||
export RESET_STEP=1
|
||||
export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
@ -0,0 +1,69 @@
|
|||
# 1. Problem
|
||||
|
||||
This problem uses BERT for NLP.
|
||||
|
||||
## Requirements
|
||||
|
||||
Install tinygrad and mlperf-logging from the `mlperf_training_v5.0` branch (uncomment the `mlperf` entry in setup.py first).
|
||||
```
|
||||
git clone https://github.com/tinygrad/tinygrad.git
|
||||
python3 -m pip install -e ".[mlperf]"
|
||||
```
|
||||
Also install gdown (for dataset), numpy, tqdm and tensorflow.
|
||||
```
|
||||
pip install gdown numpy tqdm tensorflow
|
||||
```
|
||||
|
||||
### tinybox_green
|
||||
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
|
||||
This is the default on production tinybox green.
|
||||
|
||||
# 2. Directions
|
||||
|
||||
## Steps to download and verify data
|
||||
|
||||
### 1. Download raw data
|
||||
|
||||
```
|
||||
BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
|
||||
```
|
||||
|
||||
### 2. Preprocess train and validation data
|
||||
|
||||
Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended.
|
||||
|
||||
#### Training:
|
||||
```
|
||||
BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
|
||||
```
|
||||
|
||||
To generate a single specific topic (an index between 0 and 499):
|
||||
```
|
||||
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
|
||||
```
|
||||
|
||||
#### Validation:
|
||||
```
|
||||
BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
|
||||
```
|
||||
## Running
|
||||
|
||||
### tinybox_green
|
||||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
|
||||
```
|
||||
|
||||
### tinybox_red
|
||||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
|
||||
```
|
||||
### tinybox_8xMI300X
|
||||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
|
||||
```
|
||||
|
|
@ -4,14 +4,14 @@ This problem uses BERT for NLP.
|
|||
|
||||
## Requirements
|
||||
|
||||
Install tinygrad and mlperf-logging from master.
|
||||
Install tinygrad and mlperf-logging from the `mlperf_training_v5.0` branch (uncomment the `mlperf` entry in setup.py first).
|
||||
```
|
||||
git clone https://github.com/tinygrad/tinygrad.git
|
||||
python3 -m pip install -e ".[mlperf]"
|
||||
```
|
||||
Also install tqdm and tensorflow.
|
||||
Also install gdown (for dataset), numpy, tqdm and tensorflow.
|
||||
```
|
||||
pip install tqdm tensorflow
|
||||
pip install gdown numpy tqdm tensorflow
|
||||
```
|
||||
|
||||
### tinybox_green
|
||||
|
|
@ -52,12 +52,18 @@ BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
|
|||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
|
||||
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
|
||||
```
|
||||
|
||||
### tinybox_red
|
||||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
|
||||
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
|
||||
```
|
||||
### tinybox_8xMI300X
|
||||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
|
||||
```
|
||||
|
|
@ -4,14 +4,14 @@ This problem uses BERT for NLP.
|
|||
|
||||
## Requirements
|
||||
|
||||
Install tinygrad and mlperf-logging from master.
|
||||
Install tinygrad and mlperf-logging from the `mlperf_training_v5.0` branch (uncomment the `mlperf` entry in setup.py first).
|
||||
```
|
||||
git clone https://github.com/tinygrad/tinygrad.git
|
||||
python3 -m pip install -e ".[mlperf]"
|
||||
```
|
||||
Also install tqdm and tensorflow.
|
||||
Also install gdown (for dataset), numpy, tqdm and tensorflow.
|
||||
```
|
||||
pip install tqdm tensorflow
|
||||
pip install gdown numpy tqdm tensorflow
|
||||
```
|
||||
|
||||
### tinybox_green
|
||||
|
|
@ -52,12 +52,18 @@ BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
|
|||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
|
||||
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
|
||||
```
|
||||
|
||||
### tinybox_red
|
||||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
|
||||
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
|
||||
```
|
||||
### tinybox_8xMI300X
|
||||
|
||||
#### Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
|
||||
```
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
# 1. Problem
|
||||
|
||||
This problem uses the ResNet-50 CNN to do image classification.
|
||||
|
||||
## Requirements
|
||||
|
||||
Install tinygrad and mlperf-logging from master.
|
||||
```
|
||||
git clone https://github.com/tinygrad/tinygrad.git
|
||||
python3 -m pip install -e ".[mlperf]"
|
||||
```
|
||||
|
||||
### tinybox_green
|
||||
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
|
||||
This is the default on production tinybox green.
|
||||
|
||||
### tinybox_red
|
||||
Disable cwsr
|
||||
This is the default on production tinybox red.
|
||||
```
|
||||
sudo vi /etc/modprobe.d/amdgpu.conf
|
||||
cat <<EOF > /etc/modprobe.d/amdgpu.conf
|
||||
options amdgpu cwsr_enable=0
|
||||
EOF
|
||||
sudo update-initramfs -u
|
||||
sudo reboot
|
||||
|
||||
# validate
|
||||
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
|
||||
```
|
||||
|
||||
# 2. Directions
|
||||
|
||||
## Steps to download and verify data
|
||||
|
||||
```
|
||||
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
|
||||
```
|
||||
|
||||
## Steps for one time setup
|
||||
|
||||
### tinybox_red
|
||||
```
|
||||
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
|
||||
```
|
||||
|
||||
## Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
|
||||
```
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="." NV=1
|
||||
export MODEL="resnet"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
|
||||
|
||||
export RESET_STEP=0
|
||||
|
||||
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
|
||||
|
||||
export BENCHMARK=10 DEBUG=2
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="." NV=1
|
||||
export MODEL="resnet"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
|
||||
|
||||
export RESET_STEP=0
|
||||
|
||||
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
|
||||
|
||||
export EVAL_START_EPOCH=3 EVAL_FREQ=4
|
||||
|
||||
export WANDB=1 PARALLEL=0
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="." NV=1
|
||||
export MODEL="resnet"
|
||||
export SUBMISSION_PLATFORM="tinybox_green"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
|
||||
|
||||
export RESET_STEP=0
|
||||
|
||||
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
|
||||
|
||||
# pip install -e ".[mlperf]"
|
||||
export LOGMLPERF=1
|
||||
|
||||
export SEED=$RANDOM
|
||||
DATETIME=$(date "+%m%d%H%M")
|
||||
LOGFILE="resnet_green_${DATETIME}_${SEED}.log"
|
||||
|
||||
# init
|
||||
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
|
||||
|
||||
# run
|
||||
PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
|
||||
|
|
@ -1,50 +0,0 @@
|
|||
# 1. Problem
|
||||
|
||||
This problem uses the ResNet-50 CNN to do image classification.
|
||||
|
||||
## Requirements
|
||||
|
||||
Install tinygrad and mlperf-logging from master.
|
||||
```
|
||||
git clone https://github.com/tinygrad/tinygrad.git
|
||||
python3 -m pip install -e ".[mlperf]"
|
||||
```
|
||||
|
||||
### tinybox_green
|
||||
Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
|
||||
This is the default on production tinybox green.
|
||||
|
||||
### tinybox_red
|
||||
Disable cwsr
|
||||
This is the default on production tinybox red.
|
||||
```
|
||||
sudo vi /etc/modprobe.d/amdgpu.conf
|
||||
cat <<EOF > /etc/modprobe.d/amdgpu.conf
|
||||
options amdgpu cwsr_enable=0
|
||||
EOF
|
||||
sudo update-initramfs -u
|
||||
sudo reboot
|
||||
|
||||
# validate
|
||||
sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
|
||||
```
|
||||
|
||||
# 2. Directions
|
||||
|
||||
## Steps to download and verify data
|
||||
|
||||
```
|
||||
IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
|
||||
```
|
||||
|
||||
## Steps for one time setup
|
||||
|
||||
### tinybox_red
|
||||
```
|
||||
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
|
||||
```
|
||||
|
||||
## Steps to run benchmark
|
||||
```
|
||||
examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
|
||||
```
|
||||
|
|
@ -1,13 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="." AMD=1
|
||||
export MODEL="resnet"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
|
||||
|
||||
export RESET_STEP=0
|
||||
|
||||
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
|
||||
|
||||
export BENCHMARK=10 DEBUG=2
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="." AMD=1
|
||||
export MODEL="resnet"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
|
||||
|
||||
export RESET_STEP=0
|
||||
|
||||
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
|
||||
|
||||
export EVAL_START_EPOCH=3 EVAL_FREQ=4
|
||||
|
||||
export WANDB=1 PARALLEL=0
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
@ -1,23 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="." AMD=1
|
||||
export MODEL="resnet"
|
||||
export SUBMISSION_PLATFORM="tinybox_red"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
|
||||
|
||||
export RESET_STEP=0
|
||||
|
||||
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
|
||||
|
||||
# pip install -e ".[mlperf]"
|
||||
export LOGMLPERF=1
|
||||
|
||||
export SEED=$RANDOM
|
||||
DATETIME=$(date "+%m%d%H%M")
|
||||
LOGFILE="resnet_red_${DATETIME}_${SEED}.log"
|
||||
|
||||
# init
|
||||
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
|
||||
|
||||
# run
|
||||
PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
rocm-smi --setprofile compute
|
||||
rocm-smi --setmclk 3
|
||||
rocm-smi --setperflevel high
|
||||
|
||||
# power cap to 350W
|
||||
echo "350000000" | sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap
|
||||
|
|
@ -4,7 +4,7 @@ This problem uses RetinaNet for SSD.
|
|||
|
||||
## Requirements
|
||||
|
||||
Install tinygrad and mlperf-logging from master.
|
||||
Install tinygrad and mlperf-logging from the `mlperf_training_v5.0` branch (uncomment the `mlperf` entry in setup.py first).
|
||||
```
|
||||
git clone https://github.com/tinygrad/tinygrad.git
|
||||
python3 -m pip install -e ".[mlperf]"
|
||||
|
|
|
|||
|
|
@ -1,14 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="." AMD=1
|
||||
export MODEL="retinanet"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96
|
||||
export BASEDIR="/raid/datasets/openimages"
|
||||
|
||||
# export RESET_STEP=0
|
||||
|
||||
export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
|
||||
|
||||
export BENCHMARK=5 DEBUG=2
|
||||
|
||||
python examples/mlperf/model_train.py
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
#!/bin/bash
|
||||
|
||||
export PYTHONPATH="." AMD=1
|
||||
export MODEL="retinanet"
|
||||
export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96
|
||||
export BASEDIR="/raid/datasets/openimages"
|
||||
|
||||
# export RESET_STEP=0
|
||||
|
||||
export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
|
||||
|
||||
export WANDB=1 PARALLEL=0
|
||||
export RUNMLPERF=1
|
||||
|
||||
python examples/mlperf/model_train.py
|
||||
Loading…
Add table
Add a link
Reference in a new issue