"""
main.py
Created by zenn at 2021/7/18 15:08
"""
import pytorch_lightning as pl
import argparse
import pytorch_lightning.utilities.distributed
import torch
import yaml
from easydict import EasyDict
import os
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.utils.data import DataLoader
from datasets import get_dataset
from models import get_model
# os.environ["NCCL_DEBUG"] = "INFO"


def load_yaml(file_name):
    with open(file_name, 'r') as f:
        try:
            config = yaml.load(f, Loader=yaml.FullLoader)
        except AttributeError:  # older PyYAML versions do not provide FullLoader
            config = yaml.load(f)
    return config


def parse_config():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch_size', type=int, default=100, help='input batch size')
    parser.add_argument('--epoch', type=int, default=60, help='number of epochs')
    parser.add_argument('--save_top_k', type=int, default=-1, help='save top k checkpoints')
    parser.add_argument('--check_val_every_n_epoch', type=int, default=1, help='check_val_every_n_epoch')
    parser.add_argument('--workers', type=int, default=10, help='number of data loading workers')
    parser.add_argument('--cfg', type=str, help='the config_file')
    parser.add_argument('--checkpoint', type=str, default=None, help='checkpoint location')
    parser.add_argument('--log_dir', type=str, default=None, help='log location')
    parser.add_argument('--test', action='store_true', default=False, help='test mode')
    parser.add_argument('--preloading', action='store_true', default=False, help='preload dataset into memory')

    args = parser.parse_args()
    config = load_yaml(args.cfg)
    config.update(vars(args))  # override the configuration using the value in args
    return EasyDict(config)
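
# Example training invocation (the config path below is a hypothetical placeholder;
# point --cfg at one of this repo's YAML config files). Command-line flags override
# the corresponding keys loaded from the YAML config:
#   python main.py --cfg cfgs/your_config.yaml --batch_size 64 --epoch 60 --log_dir ./logs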

cfg = parse_config()

env_cp = os.environ.copy()
try:
    # print the DDP-related environment variables set by PyTorch Lightning, if present
    node_rank, local_rank, world_size = env_cp['NODE_RANK'], env_cp['LOCAL_RANK'], env_cp['WORLD_SIZE']
    is_in_ddp_subprocess = env_cp['PL_IN_DDP_SUBPROCESS']
    pl_trainer_gpus = env_cp['PL_TRAINER_GPUS']
    print(node_rank, local_rank, world_size, is_in_ddp_subprocess, pl_trainer_gpus)
    # print the full configuration from a single rank to avoid duplicated output
    if int(local_rank) == int(world_size) - 1:
        print(cfg)
except KeyError:
    pass

# init model
if cfg.checkpoint is None:
    net = get_model(cfg.net_model)(cfg)
else:
    net = get_model(cfg.net_model).load_from_checkpoint(cfg.checkpoint, config=cfg)

if not cfg.test:
    # dataset and dataloader
    train_data = get_dataset(cfg, type=cfg.train_type, split=cfg.train_split)
    val_data = get_dataset(cfg, type='test', split=cfg.val_split)
    train_loader = DataLoader(train_data, batch_size=cfg.batch_size, num_workers=cfg.workers, shuffle=True,
                              drop_last=True, pin_memory=True)
    val_loader = DataLoader(val_data, batch_size=1, num_workers=cfg.workers, collate_fn=lambda x: x, pin_memory=True)
    checkpoint_callback = ModelCheckpoint(monitor='precision/test', mode='max', save_last=True,
                                          save_top_k=cfg.save_top_k)

    # init trainer
    trainer = pl.Trainer(gpus=-1, accelerator='ddp', max_epochs=cfg.epoch, resume_from_checkpoint=cfg.checkpoint,
                         callbacks=[checkpoint_callback], default_root_dir=cfg.log_dir,
                         check_val_every_n_epoch=cfg.check_val_every_n_epoch, num_sanity_val_steps=2,
                         gradient_clip_val=cfg.gradient_clip_val)
    trainer.fit(net, train_loader, val_loader)
else:
    test_data = get_dataset(cfg, type='test', split=cfg.test_split)
    test_loader = DataLoader(test_data, batch_size=1, num_workers=cfg.workers, collate_fn=lambda x: x, pin_memory=True)
    trainer = pl.Trainer(gpus=-1, accelerator='ddp', default_root_dir=cfg.log_dir,
                         resume_from_checkpoint=cfg.checkpoint)
    trainer.test(net, test_loader)
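
# Example test-mode invocation (config and checkpoint paths are hypothetical placeholders;
# --test switches the script into the evaluation branch above):
#   python main.py --cfg cfgs/your_config.yaml --checkpoint ./logs/checkpoints/last.ckpt --test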