resnet eval 4n+3 if epoch < 33 (#4391)

The rule only requires evaluating as thoroughly as every 4n+k epochs, and we can stop the clock as soon as eval hits the target. This can save 24 evals, or 12 minutes.
This commit is contained in:
chenyu 2024-05-02 16:52:07 -04:00 committed by GitHub
commit ab01a9433d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 9 additions and 4 deletions

View file

@ -77,7 +77,7 @@ def train_resnet():
target, achieved = getenv("TARGET", 0.759), False
eval_start_epoch = getenv("EVAL_START_EPOCH", 0)
eval_epochs = getenv("EVAL_EPOCHS", 1)
eval_freq = getenv("EVAL_FREQ", 1)
steps_in_train_epoch = config["steps_in_train_epoch"] = (len(get_train_files()) // BS)
steps_in_val_epoch = config["steps_in_val_epoch"] = (round_up(len(get_val_files()), EVAL_BS) // EVAL_BS)
@ -237,7 +237,8 @@ def train_resnet():
MLLOGGER.event(key=mllog_constants.EPOCH_STOP, value=e+1, metadata=dict(epoch_num=e+1))
# ** eval loop **
if (e + 1 - eval_start_epoch) % eval_epochs == 0 and steps_in_val_epoch > 0:
# always eval for epoch >= 33 to stop the clock as soon as the eval target is hit; it can converge at an epoch in [33, 37]
if steps_in_val_epoch > 0 and ((e + 1 - eval_start_epoch) % eval_freq == 0 or e + 1 >= 33):
if MLLOGGER and RUNMLPERF:
MLLOGGER.start(key=mllog_constants.EVAL_START, value=e+1, metadata=dict(epoch_num=e+1))
if getenv("RESET_STEP", 1): train_step.reset() # free the train step memory :(

View file

@ -6,6 +6,8 @@ export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0
export TRAIN_BEAM=3 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=50
export EVAL_START_EPOCH=3 EVAL_FREQ=4
export WANDB=1 PARALLEL=0
python3 examples/mlperf/model_train.py

View file

@ -20,4 +20,4 @@ LOGFILE="resnet_green_${DATETIME}_${SEED}.log"
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
WANDB=1 PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
WANDB=1 PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE

View file

@ -6,6 +6,8 @@ export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=25
export EVAL_START_EPOCH=3 EVAL_FREQ=4
export WANDB=1 PARALLEL=0
python3 examples/mlperf/model_train.py

View file

@ -20,4 +20,4 @@ LOGFILE="resnet_red_${DATETIME}_${SEED}.log"
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
# run
WANDB=1 PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
WANDB=1 PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE