update for 5.0

2026-06-24 02:14:17 +00:00 · 2025-04-28 14:32:08 -04:00 · 2025-04-28 14:32:08 -04:00 · 12c6573cf6
commit 12c6573cf6
parent edd4fa3b36
16 changed files with 92 additions and 265 deletions
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_1xMI300X/dev_beam.sh
@ -1,15 +0,0 @@
-#!/bin/bash
-
-export PYTHONPATH="." AMD=1
-export MODEL="bert"
-export DEFAULT_FLOAT="HALF" GPUS=1 BS=128 EVAL_BS=128
-
-export BEAM=3 BEAM_UOPS_MAX=4000 BEAM_UPCAST_MAX=256 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5
-export IGNORE_JIT_FIRST_BEAM=1
-# export BEAM_LOG_SURPASS_MAX=1
-# export BASEDIR="/raid/datasets/wiki"
-
-export RESET_STEP=1
-export BENCHMARK=10 BERT_LAYERS=2 DEBUG=2
-
-python3 examples/mlperf/model_train.py
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/README.md
@ -0,0 +1,69 @@
+# 1. Problem
+
+This problem uses BERT for NLP.
+
+## Requirements
+
+Install tinygrad and mlperf-logging from the `mlperf_training_v5.0` branch (uncomment the `mlperf` entry in setup.py first).
+```
+git clone https://github.com/tinygrad/tinygrad.git
+python3 -m pip install -e ".[mlperf]"
+```
+Also install gdown (for dataset), numpy, tqdm and tensorflow.
+```
+pip install gdown numpy tqdm tensorflow
+```
+
+### tinybox_green
+Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md).
+This is the default on production tinybox green.
+
+# 2. Directions
+
+## Steps to download and verify data
+
+### 1. Download raw data
+
+```
+BASEDIR="/raid/datasets/wiki" WIKI_TRAIN=1 VERIFY_CHECKSUM=1 python3 extra/datasets/wikipedia_download.py
+```
+
+### 2. Preprocess train and validation data
+
+Note: The number of threads used for preprocessing is limited by available memory. With 128GB of RAM, a maximum of 16 threads is recommended. 
+
+#### Training:
+```
+BASEDIR="/raid/datasets/wiki" NUM_WORKERS=16 python3 extra/datasets/wikipedia.py pre-train all
+```
+
+To generate a single topic instead, pass its index (between 0 and 499):
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-train 42
+```
+
+#### Validation:
+```
+BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval
+```
+## Running
+
+### tinybox_green
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+```
+
+### tinybox_red
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+```
+### tinybox_8xMI300X
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
+```
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/README.md
@ -4,14 +4,14 @@ This problem uses BERT for NLP.

 ## Requirements

-Install tinygrad and mlperf-logging from master.
+Install tinygrad and mlperf-logging from the `mlperf_training_v5.0` branch (uncomment the `mlperf` entry in setup.py first).
 ```
 git clone https://github.com/tinygrad/tinygrad.git
 python3 -m pip install -e ".[mlperf]"
 ```
-Also install tqdm and tensorflow.
+Also install gdown (for dataset), numpy, tqdm and tensorflow.
 ```
-pip install tqdm tensorflow
+pip install gdown numpy tqdm tensorflow
 ```

 ### tinybox_green
@ -52,12 +52,18 @@ BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval

 #### Steps to run benchmark
 ```
-examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
 ```

 ### tinybox_red

 #### Steps to run benchmark
 ```
-examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+```
+### tinybox_8xMI300X
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
 ```
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/README.md
@ -4,14 +4,14 @@ This problem uses BERT for NLP.

 ## Requirements

-Install tinygrad and mlperf-logging from master.
+Install tinygrad and mlperf-logging from the `mlperf_training_v5.0` branch (uncomment the `mlperf` entry in setup.py first).
 ```
 git clone https://github.com/tinygrad/tinygrad.git
 python3 -m pip install -e ".[mlperf]"
 ```
-Also install tqdm and tensorflow.
+Also install gdown (for dataset), numpy, tqdm and tensorflow.
 ```
-pip install tqdm tensorflow
+pip install gdown numpy tqdm tensorflow
 ```

 ### tinybox_green
@ -52,12 +52,18 @@ BASEDIR="/raid/datasets/wiki" python3 extra/datasets/wikipedia.py pre-eval

 #### Steps to run benchmark
 ```
-examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_green/run_and_time.sh
 ```

 ### tinybox_red

 #### Steps to run benchmark
 ```
-examples/mlperf/training_submission_v4.1/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_red/run_and_time.sh
+```
+### tinybox_8xMI300X
+
+#### Steps to run benchmark
+```
+examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/bert/implementations/tinybox_8xMI300X/run_and_time.sh
 ```
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/README.md
@ -1,50 +0,0 @@
-# 1. Problem
-
-This problem uses the ResNet-50 CNN to do image classification.
-
-## Requirements
-
-Install tinygrad and mlperf-logging from master.
-```
-git clone https://github.com/tinygrad/tinygrad.git
-python3 -m pip install -e ".[mlperf]"
-```
-
-### tinybox_green
-Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
-This is the default on production tinybox green.
-
-### tinybox_red
-Disable cwsr
-This is the default on production tinybox red.
-```
-sudo vi /etc/modprobe.d/amdgpu.conf
-cat <<EOF > /etc/modprobe.d/amdgpu.conf
-options amdgpu cwsr_enable=0
-EOF
-sudo update-initramfs -u
-sudo reboot
-
-# validate
-sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
-```
-
-# 2. Directions
-
-## Steps to download and verify data
-
-```
-IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
-```
-
-## Steps for one time setup
-
-### tinybox_red
-```
-examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
-```
-
-## Steps to run benchmark
-```
-examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
-```
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_beam.sh
@ -1,13 +0,0 @@
-#!/bin/bash
-
-export PYTHONPATH="." NV=1
-export MODEL="resnet"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
-
-export RESET_STEP=0
-
-export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
-
-export BENCHMARK=10 DEBUG=2
-
-python3 examples/mlperf/model_train.py
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/dev_run.sh
@ -1,15 +0,0 @@
-#!/bin/bash
-
-export PYTHONPATH="." NV=1
-export MODEL="resnet"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
-
-export RESET_STEP=0
-
-export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
-
-export EVAL_START_EPOCH=3 EVAL_FREQ=4
-
-export WANDB=1 PARALLEL=0
-
-python3 examples/mlperf/model_train.py
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_green/run_and_time.sh
@ -1,23 +0,0 @@
-#!/bin/bash
-
-export PYTHONPATH="." NV=1
-export MODEL="resnet"
-export SUBMISSION_PLATFORM="tinybox_green"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
-
-export RESET_STEP=0
-
-export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=10 BEAM_PADTO=0
-
-# pip install -e ".[mlperf]"
-export LOGMLPERF=1
-
-export SEED=$RANDOM
-DATETIME=$(date "+%m%d%H%M")
-LOGFILE="resnet_green_${DATETIME}_${SEED}.log"
-
-# init
-BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
-
-# run
-PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/README.md
@ -1,50 +0,0 @@
-# 1. Problem
-
-This problem uses the ResNet-50 CNN to do image classification.
-
-## Requirements
-
-Install tinygrad and mlperf-logging from master.
-```
-git clone https://github.com/tinygrad/tinygrad.git
-python3 -m pip install -e ".[mlperf]"
-```
-
-### tinybox_green
-Install the p2p driver per [README](https://github.com/tinygrad/open-gpu-kernel-modules/blob/550.54.15-p2p/README.md)
-This is the default on production tinybox green.
-
-### tinybox_red
-Disable cwsr
-This is the default on production tinybox red.
-```
-sudo vi /etc/modprobe.d/amdgpu.conf
-cat <<EOF > /etc/modprobe.d/amdgpu.conf
-options amdgpu cwsr_enable=0
-EOF
-sudo update-initramfs -u
-sudo reboot
-
-# validate
-sudo cat /sys/module/amdgpu/parameters/cwsr_enable #= 0
-```
-
-# 2. Directions
-
-## Steps to download and verify data
-
-```
-IMGNET_TRAIN=1 python3 extra/datasets/imagenet_download.py
-```
-
-## Steps for one time setup
-
-### tinybox_red
-```
-examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
-```
-
-## Steps to run benchmark
-```
-examples/mlperf/training_submission_v4.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
-```
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_beam.sh
@ -1,13 +0,0 @@
-#!/bin/bash
-
-export PYTHONPATH="." AMD=1
-export MODEL="resnet"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
-
-export RESET_STEP=0
-
-export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
-
-export BENCHMARK=10 DEBUG=2
-
-python3 examples/mlperf/model_train.py
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/dev_run.sh
@ -1,15 +0,0 @@
-#!/bin/bash
-
-export PYTHONPATH="." AMD=1
-export MODEL="resnet"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
-
-export RESET_STEP=0
-
-export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
-
-export EVAL_START_EPOCH=3 EVAL_FREQ=4
-
-export WANDB=1 PARALLEL=0
-
-python3 examples/mlperf/model_train.py
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/run_and_time.sh
@ -1,23 +0,0 @@
-#!/bin/bash
-
-export PYTHONPATH="." AMD=1
-export MODEL="resnet"
-export SUBMISSION_PLATFORM="tinybox_red"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=1536 EVAL_BS=192
-
-export RESET_STEP=0
-
-export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=2000 BEAM_UPCAST_MAX=96 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
-
-# pip install -e ".[mlperf]"
-export LOGMLPERF=1
-
-export SEED=$RANDOM
-DATETIME=$(date "+%m%d%H%M")
-LOGFILE="resnet_red_${DATETIME}_${SEED}.log"
-
-# init
-BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
-
-# run
-PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/resnet/implementations/tinybox_red/setup.sh
@ -1,8 +0,0 @@
-#!/bin/bash
-
-rocm-smi --setprofile compute
-rocm-smi --setmclk 3
-rocm-smi --setperflevel high
-
-# power cap to 350W
-echo "350000000" | sudo tee /sys/class/drm/card{1..6}/device/hwmon/hwmon*/power1_cap
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/README.md
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/implementations/tinybox_green/README.md
@ -4,7 +4,7 @@ This problem uses RetinaNet for SSD.

 ## Requirements

-Install tinygrad and mlperf-logging from master.
+Install tinygrad and mlperf-logging from the `mlperf_training_v5.0` branch (uncomment the `mlperf` entry in setup.py first).
 ```
 git clone https://github.com/tinygrad/tinygrad.git
 python3 -m pip install -e ".[mlperf]"
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_beam.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_beam.sh
@ -1,14 +0,0 @@
-#!/bin/bash
-
-export PYTHONPATH="." AMD=1
-export MODEL="retinanet"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96
-export BASEDIR="/raid/datasets/openimages"
-
-# export RESET_STEP=0
-
-export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
-
-export BENCHMARK=5 DEBUG=2
-
-python examples/mlperf/model_train.py
--- a/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_run.sh
+++ b/examples/mlperf/training_submission_v5.0/tinycorp/benchmarks/retinanet/implementations/tinybox_red/dev_run.sh
@ -1,15 +0,0 @@
-#!/bin/bash
-
-export PYTHONPATH="." AMD=1
-export MODEL="retinanet"
-export DEFAULT_FLOAT="HALF" GPUS=6 BS=96 EVAL_BS=96
-export BASEDIR="/raid/datasets/openimages"
-
-# export RESET_STEP=0
-
-export TRAIN_BEAM=2 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=64 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=5 BEAM_PADTO=0
-
-export WANDB=1 PARALLEL=0
-export RUNMLPERF=1
-
-python examples/mlperf/model_train.py