# train.py
import os
import argparse
import time
import tqdm
import cv2
import matplotlib.pylab as plt
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from config.config import GetConfig, COCOSourceConfig, TrainingOpt
from data.mydataset import MyDataset
from torch.utils.data import DataLoader
from models.posenet import PoseNet
from models.loss_model import MultiTaskLoss
import warnings
os.environ['CUDA_VISIBLE_DEVICES'] = "0, 1, 2, 3" # choose the available GPUs
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser(description='PoseNet Training')
parser.add_argument('--resume', '-r', action='store_true', default=False, help='resume from checkpoint')
args = parser.parse_args()
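# Example usage (with the flag defined above):
#   python train.py            # train from scratch
#   python train.py --resume   # resume network and optimizer state from opt.ckpt_path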
opt = TrainingOpt()
config = GetConfig(opt.config_name)
sourceconfig = COCOSourceConfig(opt.hdf5_train_data)
train_data = MyDataset(config, sourceconfig, shuffle=False, augment=True)  # shuffling is done by the data loader
train_loader = DataLoader(train_data, batch_size=opt.batch_size, shuffle=True, drop_last=True, num_workers=16,
                          pin_memory=True)  # tune num_workers per machine: too many or too few workers hurts throughput
sourceconfig_val = COCOSourceConfig(opt.hdf5_val_data)
val_data = MyDataset(config, sourceconfig_val, shuffle=False, augment=True)  # shuffling is done by the data loader
val_loader = DataLoader(val_data, batch_size=opt.batch_size, shuffle=True, drop_last=True, num_workers=8,
                        pin_memory=True)  # tune num_workers per machine: too many or too few workers hurts throughput
# # ############# for debug ###################
# if __name__ == '__main__':
# t0 = time.time()
# count = 0
# print(torch.cuda.get_device_name(0))
# torch.backends.cudnn.benchmark = True
# for epoch in range(20):
# for batch_id, data_dict in enumerate(train_loader):
#
# t = data_dict[0].cuda(non_blocking=True) # , non_blocking=True
# count += opt.batch_size
# print(batch_id, ' of ', epoch)
# if count > 500:
# break
# print('**************** ', count / (time.time() - t0))
use_cuda = torch.cuda.is_available()  # check whether a CUDA-capable GPU is available
best_loss = float('inf')
start_epoch = 0  # start from 0, or from the last checkpointed epoch when resuming
posenet = PoseNet(opt.nstack, opt.hourglass_inp_dim, config.num_layers, bn=False)
optimizer = optim.SGD(posenet.parameters(), lr=opt.learning_rate, momentum=0.9, weight_decay=1e-4)
if args.resume:
print('\nResuming from checkpoint ...... ')
checkpoint = torch.load(opt.ckpt_path, map_location=torch.device('cpu')) # map to cpu to save the gpu memory
posenet.load_state_dict(checkpoint['weights'])
print('\nNetwork weights have been resumed from checkpoint...')
optimizer.load_state_dict(checkpoint['optimizer_weight'])
    # We must move the resumed optimizer state back to the GPU.
    """The previous training ran on the GPU, so the tensors saved in optimizer.state_dict() are CUDA tensors.
    Because the checkpoint is loaded with map_location=torch.device('cpu') (to save GPU memory), the resumed
    optimizer state now lives on the CPU, but training needs it on the GPU, so every state tensor is moved
    back to CUDA below."""
for state in optimizer.state.values():
for k, v in state.items():
if torch.is_tensor(v):
state[k] = v.cuda()
print('\nOptimizer has been resumed from checkpoint...')
best_loss = checkpoint['train_loss']
print('\n******************** Best loss resumed is :', best_loss, ' ************************')
start_epoch = checkpoint['epoch'] + 1
# else:
# print(' # Loading pretrained model # ')
# posenet.load_state_dict(torch.load(opt.pretrained_model))
criterion = MultiTaskLoss(opt, config)
if use_cuda:
posenet = torch.nn.parallel.DataParallel(posenet.cuda()) # , device_ids=[0, 1, 2, 3]
    # module.cuda() only moves the registered parameters to the GPU.
criterion.cuda()
    torch.backends.cudnn.benchmark = True  # speeds up training when the input size stays the same every iteration
# torch.backends.cudnn.deterministic = True
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.2, last_epoch=-1)  # learning-rate decay schedule
for i in range(start_epoch):
    # advance the scheduler start_epoch times so the resumed learning rate matches the saved epoch
scheduler.step()
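# An alternative sketch (not used here): instead of stepping the scheduler in a loop, rebuild it with
# last_epoch=start_epoch - 1. PyTorch then expects an 'initial_lr' entry in every param group, e.g.:
#   for group in optimizer.param_groups:
#       group.setdefault('initial_lr', opt.learning_rate)
#   scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.2,
#                                               last_epoch=start_epoch - 1)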
for param in posenet.parameters():
if param.requires_grad:
        print('Network parameters require gradients (autograd is enabled)')
break
def train(epoch):
print('\n ############################# Train phase, Epoch: {} #############################'.format(epoch))
posenet.train()
train_loss = 0
scheduler.step()
print('\nLearning rate at this epoch is: ', optimizer.param_groups[0]['lr'], '\n')
for batch_idx, target_tuple in enumerate(train_loader):
# images.requires_grad_()
# loc_targets.requires_grad_()
# conf_targets.requires_grad_()
if use_cuda:
target_tuple = [target_tensor.cuda(non_blocking=True) for target_tensor in target_tuple]
# target tensor shape: [8,512,512,3], [8, 1, 128,128], [8,43,128,128], [8,36,128,128], [8,36,128,128]
images, mask_misses, heatmaps = target_tuple # , offsets, mask_offsets
# images = Variable(images)
# loc_targets = Variable(loc_targets)
# conf_targets = Variable(conf_targets)
optimizer.zero_grad() # zero the gradient buff
output_tuple = posenet(images)
# print(loc_preds.requires_grad)
# print(conf_preds.requires_grad)
loss = criterion(output_tuple, target_tuple[1:])
if loss.item() > 1e6:
print("Loss is abnormal, drop this batch !")
continue
# print(loss.requires_grad)
loss.backward() # retain_graph=True
        optimizer.step()  # TODO: gradient accumulation could effectively enlarge the batch size, but the default BN momentum would then need to be reduced
        train_loss += loss.item()  # accumulate the loss
        # Use loss += loss.detach() to accumulate a value that is cut off from gradient backpropagation,
        # or use loss.item() to get the Python scalar directly (only one-element tensors can be converted to Python scalars).
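        # A sketch of the gradient accumulation mentioned in the TODO above; accumulation_steps is a
        # hypothetical variable that is not defined in this script:
        #   loss = loss / accumulation_steps
        #   loss.backward()
        #   if (batch_idx + 1) % accumulation_steps == 0:
        #       optimizer.step()
        #       optimizer.zero_grad()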
print('########################### Epoch:', epoch, ', -- batch:', batch_idx, '/', len(train_loader), ', ',
'Train loss: %.3f, accumulated average loss: %.3f ##############################' % (loss.item(), train_loss / (batch_idx + 1)))
global best_loss
train_loss /= len(train_loader)
if train_loss < best_loss:
print('saving...')
state = {
            # save posenet.module.state_dict() instead of posenet.state_dict(), so the keys are not prefixed with "module."
'weights': posenet.module.state_dict(),
'optimizer_weight': optimizer.state_dict(),
'train_loss': train_loss,
'epoch': epoch,
}
os.makedirs('checkpoints', exist_ok=True)
torch.save(state, './checkpoints/PoseNet_' + str(epoch) + '_epoch.pth')
best_loss = train_loss
def test(epoch, show_image=False):
print('\nTest phase, Epoch: {}'.format(epoch))
posenet.eval()
with torch.no_grad(): # will save gpu memory and speed up
test_loss = 0
for batch_idx, target_tuple in enumerate(val_loader):
# images.requires_grad_()
# loc_targets.requires_grad_()
# conf_targets.requires_grad_()
if use_cuda:
target_tuple = [target_tensor.cuda(non_blocking=True) for target_tensor in target_tuple]
# target tensor shape: [8,512,512,3], [8, 1, 128,128], [8,43,128,128], [8,36,128,128], [8,36,128,128]
images, mask_misses, heatmaps, offsets, mask_offsets = target_tuple
# images = Variable(images)
# loc_targets = Variable(loc_targets)
# conf_targets = Variable(conf_targets)
output_tuple = posenet(images)
loss = criterion(output_tuple, target_tuple[1:])
            test_loss += loss.item()  # accumulate the loss
print(' Test loss : %.3f, accumulated average loss: %.3f' % (loss.item(), test_loss / (batch_idx + 1)))
# ############### For Debug ##########
if show_image:
image, mask_miss, labels, offsets, mask_offset = [v.cpu().numpy() for v in target_tuple]
output = output_tuple[-1][0].cpu().numpy()
# show the generated ground truth
img = image[0]
output = output[0].transpose((1, 2, 0))
img = cv2.resize(img, output.shape[:2], interpolation=cv2.INTER_CUBIC)
plt.imshow(img[:, :, [2, 1, 0]]) # Opencv image format: BGR
plt.imshow(output[:, :, 28], alpha=0.5) # mask_all
# plt.imshow(mask_offset[:, :, 2], alpha=0.5) # mask_all
plt.show()
# #######################################
if __name__ == '__main__':
for epoch in range(start_epoch, start_epoch + 200):
train(epoch)
test(epoch, show_image=False)