mirror of
https://github.com/tinygrad/tinygrad.git
synced 2026-06-24 02:14:17 +00:00
resnet eval 4n+3 if epoch < 33 (#4391)
the rule is to eval at least as thoroughly as every 4n+k epochs, and we can stop the clock as soon as eval hits the target. this can save 24 evals or 12 minutes
This commit is contained in:
parent
7c8401fc65
commit
ab01a9433d
5 changed files with 9 additions and 4 deletions
|
|
@ -77,7 +77,7 @@ def train_resnet():
|
|||
|
||||
target, achieved = getenv("TARGET", 0.759), False
|
||||
eval_start_epoch = getenv("EVAL_START_EPOCH", 0)
|
||||
eval_epochs = getenv("EVAL_EPOCHS", 1)
|
||||
eval_freq = getenv("EVAL_FREQ", 1)
|
||||
|
||||
steps_in_train_epoch = config["steps_in_train_epoch"] = (len(get_train_files()) // BS)
|
||||
steps_in_val_epoch = config["steps_in_val_epoch"] = (round_up(len(get_val_files()), EVAL_BS) // EVAL_BS)
|
||||
|
|
@ -237,7 +237,8 @@ def train_resnet():
|
|||
MLLOGGER.event(key=mllog_constants.EPOCH_STOP, value=e+1, metadata=dict(epoch_num=e+1))
|
||||
|
||||
# ** eval loop **
|
||||
if (e + 1 - eval_start_epoch) % eval_epochs == 0 and steps_in_val_epoch > 0:
|
||||
# always eval for epoch >= 33 to stop the clock as soon as the eval target is hit; it can converge at any epoch in [33, 37]
|
||||
if steps_in_val_epoch > 0 and ((e + 1 - eval_start_epoch) % eval_freq == 0 or e + 1 >= 33):
|
||||
if MLLOGGER and RUNMLPERF:
|
||||
MLLOGGER.start(key=mllog_constants.EVAL_START, value=e+1, metadata=dict(epoch_num=e+1))
|
||||
if getenv("RESET_STEP", 1): train_step.reset() # free the train step memory :(
|
||||
|
|
|
|||
|
|
@ -6,6 +6,8 @@ export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0
|
|||
|
||||
export TRAIN_BEAM=3 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=50
|
||||
|
||||
export EVAL_START_EPOCH=3 EVAL_FREQ=4
|
||||
|
||||
export WANDB=1 PARALLEL=0
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
|
|||
|
|
@ -20,4 +20,4 @@ LOGFILE="resnet_green_${DATETIME}_${SEED}.log"
|
|||
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
|
||||
|
||||
# run
|
||||
WANDB=1 PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
|
||||
WANDB=1 PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
|
||||
|
|
@ -6,6 +6,8 @@ export SPLIT_REDUCEOP=1 LAZYCACHE=0 RESET_STEP=0
|
|||
|
||||
export TRAIN_BEAM=4 IGNORE_JIT_FIRST_BEAM=1 BEAM_UOPS_MAX=1500 BEAM_UPCAST_MAX=128 BEAM_LOCAL_MAX=1024 BEAM_MIN_PROGRESS=25
|
||||
|
||||
export EVAL_START_EPOCH=3 EVAL_FREQ=4
|
||||
|
||||
export WANDB=1 PARALLEL=0
|
||||
|
||||
python3 examples/mlperf/model_train.py
|
||||
|
|
|
|||
|
|
@ -20,4 +20,4 @@ LOGFILE="resnet_red_${DATETIME}_${SEED}.log"
|
|||
BENCHMARK=10 INITMLPERF=1 python3 examples/mlperf/model_train.py | tee $LOGFILE
|
||||
|
||||
# run
|
||||
WANDB=1 PARALLEL=0 RUNMLPERF=1 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
|
||||
WANDB=1 PARALLEL=0 RUNMLPERF=1 EVAL_START_EPOCH=3 EVAL_FREQ=4 python3 examples/mlperf/model_train.py | tee -a $LOGFILE
|
||||
Loading…
Add table
Add a link
Reference in a new issue