-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathtrain_local_single.sh
executable file
·96 lines (86 loc) · 2.7 KB
/
train_local_single.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env bash
#
# Launch a single-GPU training run of `initialisers.task` in the background.
#
# Usage: ./train_local_single.sh <GPU_ID> [ENV_NAME]
#   GPU_ID    passed to the training process as CUDA_VISIBLE_DEVICES; also
#             names the log/pid files (runlogs/<GPU_ID>.log, .pid).
#   ENV_NAME  optional virtualenv directory. When given, ENV_NAME/bin/activate
#             is sourced before training. It is also the first component of
#             the job name; without it the label "default" is used and no
#             virtualenv is activated.
#
# Before launching, a notes stub for this run is appended to training_log.md
# and opened in vim; training starts once vim is closed.

# Strict mode for the whole script. Optional variables and arguments are read
# with ${var:-} below so that nounset does not abort on them.
set -euo pipefail

##########################################################
# where to write tfevents
OUTPUT_DIR="gs://model-exports"
# experiment settings
TRAIN_BATCH=512
EVAL_BATCH=512
LR=0.001
EPOCHS=100
# prefix used in the job name for this run
prefix="example"
# locations locally or on the cloud for your files
TRAIN_FILES="data/train.tfrecords"
EVAL_FILES="data/val.tfrecords"
TEST_FILES="data/test.tfrecords"
##########################################################

# GPU_ID is mandatory; ENV_NAME is optional.
if [[ -z "${1:-}" ]]; then
  {
    echo "Incorrect arguments specified."
    echo ""
    echo "Usage: ./train_local_single.sh <GPU_ID> [ENV_NAME]"
    echo ""
  } >&2
  exit 1
fi
GPU_ID=$1
env_arg=${2:-}
ENV_NAME=${env_arg:-default}

# Build the job name only after ENV_NAME is known, otherwise the name would
# start with an empty environment component.
now=$(date +"%Y%m%d_%H_%M_%S")
JOB_NAME="${ENV_NAME}-${prefix}_${now}"

# Warn (but do not abort) when the CUDA environment looks unconfigured.
if [[ -z "${LD_LIBRARY_PATH:-}" || -z "${CUDA_HOME:-}" ]]; then
  echo ""
  echo "CUDA environment variables not set."
  echo "Consider adding them to your shell-rc."
  echo ""
  echo "Example:"
  echo "----------------------------------------------------------"
  echo 'LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64"'
  echo 'LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/extras/CUPTI/lib64"'
  echo 'CUDA_HOME="/usr/local/cuda"'
  echo ""
fi

# directory containing this script
DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# create the folder for logs and pid files if it doesn't exist
mkdir -p "${DIR}/runlogs"
# job directory for checkpoints etc
JOB_DIR="${OUTPUT_DIR}/${JOB_NAME}"

###################
# Append notes about this training job to the log file, then open it for
# editing; close vim to start training. Useful if you are running lots of
# different experiments and forget which values you used.
{
  echo "---"
  echo "## ${JOB_NAME}"
  echo "Learning Rate: ${LR}"
  echo "Epochs: ${EPOCHS}"
  echo "Batch Size (train/eval): ${TRAIN_BATCH}/ ${EVAL_BATCH}"
  echo "### Hypothesis"
  echo ""
  echo "### Results"
  echo ""
} >>training_log.md
vim + training_log.md
###################

# Activate the virtual environment only when one was named on the command
# line. nounset is relaxed around the activate script because it is
# third-party code that may read unset variables.
if [[ -n "${env_arg}" ]]; then
  set +u
  # shellcheck disable=SC1091
  source "${ENV_NAME}/bin/activate"
  set -u
fi

# Start training in the background. CUDA_VISIBLE_DEVICES is given as a prefix
# assignment so it is placed in the environment of python3 itself.
CUDA_VISIBLE_DEVICES="${GPU_ID}" python3 -m initialisers.task \
  --job-dir "${JOB_DIR}" \
  --train-batch-size "${TRAIN_BATCH}" \
  --eval-batch-size "${EVAL_BATCH}" \
  --learning-rate "${LR}" \
  --num-epochs "${EPOCHS}" \
  --train-files "${TRAIN_FILES}" \
  --eval-files "${EVAL_FILES}" \
  --test-files "${TEST_FILES}" \
  --export-path "${OUTPUT_DIR}/exports" \
  &>"${DIR}/runlogs/${GPU_ID}.log" &
# record the PID of the background job so it can be monitored or killed later
echo "$!" >"${DIR}/runlogs/${GPU_ID}.pid"
echo "Job launched."