training.py
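"""Training and evaluation utilities.

train() runs a training loop with optional gradient accumulation, evaluates the
model on the validation and training loaders every `evaluation_per_step` batches,
logs metrics to Weights & Biases, and saves a checkpoint after every epoch.
evaluate_model() computes accuracy over a data loader.
"""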
import gc
import os

import torch
import tqdm
import wandb

def train(model, train_loader, validation_loader, epochs, optimizer, evaluation_per_step=10, acc_step=1):
    wandb.watch(model, log_freq=evaluation_per_step)
    for epoch in range(epochs):
        print(
            f'+++++++++++++++++++++++++++++++++++++++++ epoch: {epoch + 1} ++++++++++++++++++++++++++++++++++++++++++++')
        print('training...')
        cnt = 0
        total_loss = 0
        model.train()
        for i, (element1, element2) in enumerate(train_loader):
            print(f'--------------------------- train loader {i} ----------------------------------')
            cnt += 1
            # to cuda?
            # element1 = element1.to('cuda')
            # element2 = element2.to('cuda')
            # evaluate the model every "evaluation_per_step" steps
            if cnt % evaluation_per_step == 0:
                accuracy1 = evaluate_model(model, validation_loader)
                accuracy2 = evaluate_model(model, train_loader)
                accuracy1 = accuracy1.view(-1).cpu().item()
                accuracy2 = accuracy2.view(-1).cpu().item()
                # evaluate_model() switches to eval mode, so switch back before resuming training
                model.train()
                print(f'count: {cnt}')
                print(f'********accuracy: {accuracy1} ********')
                print(f'********loss: {total_loss / evaluation_per_step} ********')
                wandb.log({"train/train-acc": accuracy2, "train/eval-acc": accuracy1,
                           "train/loss": total_loss / evaluation_per_step})
                total_loss = 0
            # if i % 20 == 0:
            #     torch.cuda.empty_cache()
            #     print('torch cache cleaned!')
            # torch.cuda.empty_cache()
            # print('torch cache cleaned!')
            if cnt % acc_step == 0:
                loss = model.get_loss(model((element1, element2)))
                total_loss += loss.detach().cpu().item()
                # clean cuda cache (otherwise it will cause cuda out of memory error(っ °Д °;)っ)
                gc.collect()
                torch.cuda.empty_cache()
                print('torch cache cleaned!')
                print(f'start backward {cnt}...')
                loss.backward()
                print(f'successfully end backward {cnt}')
                optimizer.step()
                # zero the gradients only after the optimizer step, so gradients
                # accumulated over the previous acc_step - 1 batches are not discarded
                optimizer.zero_grad()
            else:
                # accumulate gradients without stepping the optimizer
                loss = model.get_loss(model((element1, element2)))
                loss.backward()
        # save a checkpoint at the end of every epoch
        os.makedirs('./saved_models', exist_ok=True)
        # torch.save(model.state_dict(), './autodl-tmp/saved_model/' + str(epoch + 100) + '.pth')
        torch.save(model.state_dict(), './saved_models/' + str(epoch + 100) + '.pth')
        print(f'saved epoch {epoch} model successfully!')

def evaluate_model(model, loader):
    print('##########evaluating#################')
    model.eval()
    with torch.no_grad():
        # correct = torch.tensor([0]).cuda()
        # total = torch.tensor([0]).cuda()
        correct = 0
        total = 0
        for i, (element1, element2) in enumerate(tqdm.tqdm(loader)):
            right, num = model.get_accuracy(model((element1, element2)))
            # .to() is not in-place, so assign the result back
            right = right.to(torch.device('cpu'))
            correct += right
            total += num
    print('##########evaluate successful!########')
    return correct / total
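
# ---------------------------------------------------------------------------
# Example usage (a minimal sketch, not part of the original script).
# `ToyPairModel` and the random tensors below are hypothetical stand-ins that
# only illustrate the interface train()/evaluate_model() assume: the model is
# called with an (element1, element2) pair and exposes get_loss()/get_accuracy().
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from torch.utils.data import DataLoader, TensorDataset

    class ToyPairModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = torch.nn.Linear(8, 2)

        def forward(self, pair):
            element1, element2 = pair
            return self.fc(element1), element2  # keep labels alongside logits

        def get_loss(self, output):
            logits, labels = output
            return torch.nn.functional.cross_entropy(logits, labels)

        def get_accuracy(self, output):
            logits, labels = output
            right = (logits.argmax(dim=1) == labels).sum()
            return right, labels.numel()

    dataset = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))
    train_loader = DataLoader(dataset, batch_size=16, shuffle=True)
    validation_loader = DataLoader(dataset, batch_size=16)

    model = ToyPairModel()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    # train() calls wandb.watch()/wandb.log(), so an active run is required;
    # offline mode avoids needing a wandb account for this sketch.
    wandb.init(project='training-example', mode='offline')
    train(model, train_loader, validation_loader, epochs=1, optimizer=optimizer,
          evaluation_per_step=2, acc_step=1)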