#! /bin/bash
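## Drive the Megatron-LM GPT experiments. Usage (based on the branches below):
##   bash megatron_gpt_execute.sh <small|large|scale>
## small - single-node, 4-GPU run of the 1.3B GPT model
## large - multi-node runs driven by the pssh scripts under scripts/megatron_dist_scripts
## scale - single-node, 8-GPU sweep over model depth (8 to 1024 layers)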
ROOT_PATH=$(pwd)
exp_setting=$1
model_name=gpt
if [ "$exp_setting" == "small" ]; then
cd $ROOT_PATH/external/Megatron-LM
#### Model info ####
model_size=1_3B
#### Hardware info ####
NNODES=1
GPUS_PER_NODE=4
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
#### Distributed info ####
## Modify NODE_RANK and MASTER_ADDR below when running across multiple nodes
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=7000
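## Example overrides for a second node (hypothetical values; rank 0 keeps the settings above):
# NODE_RANK=1
# MASTER_ADDR=<ip-of-the-rank-0-node>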
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
#### Paths ####
RESULT_PATH=${ROOT_PATH}/logs/megatron/
LOG_PATH=${RESULT_PATH}runtime/${model_name}/${model_size}/
CONFIG_SAVE_PATH=${RESULT_PATH}configs/${model_name}/${model_size}/
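## CONFIG_SAVE_PATH is expected to contain the <config_name>.json plan files
## produced earlier in the pipeline; each one is executed in turn below.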
mkdir -p ${LOG_PATH}csv
for file_name in $(ls $CONFIG_SAVE_PATH)
do
config_name=$(basename "$file_name" .json)
CURRENT_TIME=$(date '+%Y-%m-%d-%H-%M-%S')
echo "[LOG][RUNTIME]($(date '+%Y-%m-%d-%H-%M-%S')) start executing config: $config_name ." >> ${RESULT_PATH}full_log.log
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--config-file $CONFIG_SAVE_PATH${file_name} \
--train-iters 3 \
--eval-iters 0 \
--lr-decay-iters 320000 \
--vocab-file ${ROOT_PATH}/runtime/vocabs/gpt2-vocab.json \
--merge-file ${ROOT_PATH}/runtime/vocabs/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 1 \
--DDP-impl local \
--fp16 \
--log-path $LOG_PATH \
2>&1 | tee ${LOG_PATH}full_log_${config_name}_rank${NODE_RANK}_${CURRENT_TIME}
echo "[LOG][RUNTIME]($(date '+%Y-%m-%d-%H-%M-%S')) end executing config: $config_name ." >> ${RESULT_PATH}full_log.log
done
elif [ "$exp_setting" == "large" ]; then
## Paths
RESULT_PATH=${ROOT_PATH}/logs-large/megatron/
## 1node (4GPUs and 8GPUs)
bash scripts/megatron_dist_scripts/run_${model_name}_1node.sh
## 2nodes
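## pssh-*.host files are expected to list the participating worker hosts; any
## leftover 'pretrain' processes are killed there before the next launch.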
parallel-ssh -i -t 0 -h pssh-2workers.host "ps -aux | grep 'pretrain' | grep -v grep | awk '{print \$2}' | xargs kill -9"
parallel-ssh -i -t 0 -h pssh-2workers.host "cd $ROOT_PATH && bash scripts/megatron_dist_scripts/run_${model_name}_2nodes.sh"
## 4nodes
parallel-ssh -i -t 0 -h pssh-4workers.host "ps -aux | grep 'pretrain' | grep -v grep | awk '{print \$2}' | xargs kill -9"
parallel-ssh -i -t 0 -h pssh-4workers.host "cd $ROOT_PATH && bash scripts/megatron_dist_scripts/run_${model_name}_4nodes.sh"
elif [ "$exp_setting" == "scale" ]; then
cd $ROOT_PATH/external/Megatron-LM
#### Model info ####
model_name=scale-layer
#### Hardware info ####
NNODES=1
GPUS_PER_NODE=8
WORLD_SIZE=$(($GPUS_PER_NODE*$NNODES))
#### Distributed info ####
## Modify NODE_RANK and MASTER_ADDR below when running across multiple nodes
NODE_RANK=0
MASTER_ADDR=localhost
MASTER_PORT=7000
DISTRIBUTED_ARGS="--nproc_per_node $GPUS_PER_NODE --nnodes $NNODES --node_rank $NODE_RANK --master_addr $MASTER_ADDR --master_port $MASTER_PORT"
#### Paths ####
RESULT_PATH=${ROOT_PATH}/logs-large/megatron/
num_layers_list=(8 16 32 64 128 256 512 1024)
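## Sweep over model depth; each depth reuses the same launch flow as the small setting.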
for num_layers in "${num_layers_list[@]}"
do
model_size=${num_layers}layers
LOG_PATH=${RESULT_PATH}runtime/${model_name}/${model_size}/
CONFIG_SAVE_PATH=${RESULT_PATH}configs/${model_name}/${model_size}/
mkdir -p ${LOG_PATH}csv
for file_name in $(ls $CONFIG_SAVE_PATH)
do
config_name=$(basename "$file_name" .json)
CURRENT_TIME=$(date '+%Y-%m-%d-%H-%M-%S')
echo "[LOG][RUNTIME]($(date '+%Y-%m-%d-%H-%M-%S')) start executing config: $config_name ." >> ${RESULT_PATH}full_log.log
python -m torch.distributed.launch $DISTRIBUTED_ARGS \
pretrain_gpt.py \
--config-file $CONFIG_SAVE_PATH${file_name} \
--train-iters 3 \
--eval-iters 0 \
--lr-decay-iters 320000 \
--vocab-file ${ROOT_PATH}/runtime/vocabs/gpt2-vocab.json \
--merge-file ${ROOT_PATH}/runtime/vocabs/gpt2-merges.txt \
--data-impl mmap \
--split 949,50,1 \
--distributed-backend nccl \
--lr 0.00015 \
--lr-decay-style cosine \
--min-lr 1.0e-5 \
--weight-decay 1e-2 \
--clip-grad 1.0 \
--lr-warmup-fraction .01 \
--log-interval 1 \
--DDP-impl local \
--fp16 \
--log-path $LOG_PATH \
2>&1 | tee ${LOG_PATH}full_log_${config_name}_rank${NODE_RANK}_${CURRENT_TIME}
echo "[LOG][RUNTIME]($(date '+%Y-%m-%d-%H-%M-%S')) end executing config: $config_name ." >> ${RESULT_PATH}full_log.log
done
done
else
echo "Unknown setting '$exp_setting'; expected one of: small, large, scale." >&2
exit 1
fi
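## Finally, report the best observed performance from the collected logs
## (show_best_perf.py takes the model name and the result path).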
python3 $ROOT_PATH/runtime/scripts/show_best_perf.py $model_name $RESULT_PATH 2>&1 | tee -a ${RESULT_PATH}full_log.log