From fb58369f996bbb6b4d978ec7aae68b4ea8b8d1cc Mon Sep 17 00:00:00 2001 From: Myle Ott Date: Sat, 29 Jul 2017 18:47:09 -0400 Subject: [PATCH] Factor out and group common CLI args (#34) --- generate.py | 77 ++++++++------------------------------ options.py | 105 ++++++++++++++++++++++++++++++++++++++++++++++++++++ train.py | 95 ++++++++++++----------------------------------- 3 files changed, 145 insertions(+), 132 deletions(-) create mode 100644 options.py diff --git a/generate.py b/generate.py index 15adecbed6..89d5fc11eb 100644 --- a/generate.py +++ b/generate.py @@ -9,69 +9,24 @@ import bleu import data import models +import options import utils from meters import StopwatchMeter, TimeMeter -parser = argparse.ArgumentParser(description='Convolutional Sequence to Sequence Generation') -parser.add_argument('data', metavar='DIR', - help='path to data directory') -parser.add_argument('--path', metavar='FILE', default='./checkpoint_best.pt', +parser = options.get_parser('Generation') +parser.add_argument('--path', metavar='FILE', required=True, default='./checkpoint_best.pt', help='path to model file') -# dataset and data loading -parser.add_argument('--subset', default='test', metavar='SPLIT', - choices=['train', 'valid', 'test'], - help='data subset to generate (train, valid, test)') -parser.add_argument('--batch-size', default=32, type=int, metavar='N', - help='batch size') - -# generation configuration -parser.add_argument('--beam', default=5, type=int, metavar='N', - help='beam size') -parser.add_argument('--nbest', default=1, type=int, metavar='N', - help='number of hypotheses to output') -parser.add_argument('--max-len-a', default=0, type=int, metavar='N', - help=('generate sequence of maximum length ax + b, ' - 'where x is the source length')) -parser.add_argument('--max-len-b', default=200, type=int, metavar='N', - help=('generate sequence of maximum length ax + b, ' - 'where x is the source length')) -parser.add_argument('--no-early-stop', action='store_true', - help=('continue searching even after finalizing k=beam ' - 'hypotheses; this is more correct, but increases ' - 'generation time by 50%%')) -parser.add_argument('--unnormalized', action='store_true', - help='compare unnormalized hypothesis scores') - -# misc -parser.add_argument('--cpu', action='store_true', help='generate on CPU') -parser.add_argument('--beamable-mm', action='store_true', - help='use BeamableMM in attention layers') -parser.add_argument('--no-progress-bar', action='store_true', - help='disable progress bar') - -# model configuration -# TODO infer this from model file -parser.add_argument('--arch', '-a', default='fconv', metavar='ARCH', - choices=models.__all__, - help='model architecture ({})'.format(', '.join(models.__all__))) -parser.add_argument('--encoder-embed-dim', default=512, type=int, metavar='N', - help='encoder embedding dimension') -parser.add_argument('--encoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR', - help='encoder layers [(dim, kernel_size), ...]') -parser.add_argument('--decoder-embed-dim', default=512, type=int, metavar='N', - help='decoder embedding dimension') -parser.add_argument('--decoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR', - help='decoder layers [(dim, kernel_size), ...]') -parser.add_argument('--decoder-attention', default='True', type=str, metavar='EXPR', - help='decoder attention [True, ...]') -parser.add_argument('--dropout', default=0.1, type=float, metavar='D', - help='dropout probability') -parser.add_argument('--label-smoothing', 
default=0, type=float, metavar='D', - help='epsilon for label smoothing, 0 means no label smoothing') -parser.add_argument('--decoder-out-embed-dim', default=256, type=int, metavar='N', - help='decoder output embedding dimension') +dataset_args = options.add_dataset_args(parser) +dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N', + help='batch size') +dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT', + choices=['train', 'valid', 'test'], + help='data subset to generate (train, valid, test)') + +options.add_generation_args(parser) +options.add_model_args(parser) def main(): @@ -83,10 +38,10 @@ def main(): progress_bar.enabled = False use_cuda = torch.cuda.is_available() and not args.cpu - dataset = data.load(args.data) + dataset = data.load(args.data, args.source_lang, args.target_lang) print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict))) print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict))) - print('| {} {} {} examples'.format(args.data, args.subset, len(dataset.splits[args.subset]))) + print('| {} {} {} examples'.format(args.data, args.gen_subset, len(dataset.splits[args.gen_subset]))) # TODO infer architecture from model file print('| model {}'.format(args.arch)) @@ -115,7 +70,7 @@ def display_hypotheses(id, src, ref, hypos): # Generate and compute BLEU score scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos()) - itr = dataset.dataloader(args.subset, batch_size=args.batch_size) + itr = dataset.dataloader(args.gen_subset, batch_size=args.batch_size) num_sentences = 0 with progress_bar(itr, smoothing=0, leave=False) as t: wps_meter = TimeMeter() @@ -131,7 +86,7 @@ def display_hypotheses(id, src, ref, hypos): num_sentences += 1 print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.format( num_sentences, gen_timer.n, gen_timer.sum, 1. 
/ gen_timer.avg)) - print('| Generate {} with beam={}: BLEU4 = {:2.2f}'.format(args.subset, args.beam, scorer.score())) + print('| Generate {} with beam={}: BLEU4 = {:2.2f}'.format(args.gen_subset, args.beam, scorer.score())) def to_sentence(dict, tokens): diff --git a/options.py b/options.py new file mode 100644 index 0000000000..0a081c454a --- /dev/null +++ b/options.py @@ -0,0 +1,105 @@ +import argparse + +import models + + +def get_parser(desc): + parser = argparse.ArgumentParser( + description='Facebook AI Research Sequence-to-Sequence Toolkit -- ' + desc) + parser.add_argument('--no-progress-bar', action='store_true', help='disable progress bar') + parser.add_argument('--log-interval', type=int, default=1000, metavar='N', + help='log progress every N updates (when progress bar is disabled)') + parser.add_argument('--seed', default=1, type=int, metavar='N', + help='pseudo random number generator seed') + return parser + + +def add_dataset_args(parser): + group = parser.add_argument_group('Dataset and data loading') + group.add_argument('data', metavar='DIR', + help='path to data directory') + group.add_argument('-s', '--source-lang', default=None, metavar='SRC', + help='source language') + group.add_argument('-t', '--target-lang', default=None, metavar='TARGET', + help='target language') + group.add_argument('-j', '--workers', default=1, type=int, metavar='N', + help='number of data loading workers (default: 1)') + return group + + +def add_optimization_args(parser): + group = parser.add_argument_group('Optimization') + group.add_argument('--lr', '--learning-rate', default=0.25, type=float, metavar='LR', + help='initial learning rate') + group.add_argument('--min-lr', metavar='LR', default=1e-5, type=float, + help='minimum learning rate') + group.add_argument('--force-anneal', '--fa', default=0, type=int, metavar='N', + help='force annealing at specified epoch') + group.add_argument('--lrshrink', default=0.1, type=float, metavar='LS', + help='learning rate shrink factor for annealing, lr_new = (lr * lrshrink)') + group.add_argument('--momentum', default=0.99, type=float, metavar='M', + help='momentum factor') + group.add_argument('--clip-norm', default=25, type=float, metavar='NORM', + help='clip threshold of gradients') + group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', + help='weight decay') + group.add_argument('--dropout', default=0.1, type=float, metavar='D', + help='dropout probability') + group.add_argument('--label-smoothing', default=0, type=float, metavar='D', + help='epsilon for label smoothing, 0 means no label smoothing') + return group + + +def add_checkpoint_args(parser): + group = parser.add_argument_group('Checkpointing') + group.add_argument('--save-dir', metavar='DIR', default='checkpoints', + help='path to save checkpoints') + group.add_argument('--restore-file', default='checkpoint_last.pt', + help='filename in save-dir from which to load checkpoint') + group.add_argument('--save-interval', type=int, default=-1, + help='checkpoint every this many batches') + return group + + +def add_generation_args(parser): + group = parser.add_argument_group('Generation') + group.add_argument('--beam', default=5, type=int, metavar='N', + help='beam size') + group.add_argument('--nbest', default=1, type=int, metavar='N', + help='number of hypotheses to output') + group.add_argument('--max-len-a', default=0, type=int, metavar='N', + help=('generate sequence of maximum length ax + b, ' + 'where x is the source length')) + 
group.add_argument('--max-len-b', default=200, type=int, metavar='N', + help=('generate sequence of maximum length ax + b, ' + 'where x is the source length')) + group.add_argument('--no-early-stop', action='store_true', + help=('continue searching even after finalizing k=beam ' + 'hypotheses; this is more correct, but increases ' + 'generation time by 50%%')) + group.add_argument('--unnormalized', action='store_true', + help='compare unnormalized hypothesis scores') + group.add_argument('--cpu', action='store_true', help='generate on CPU') + group.add_argument('--beamable-mm', action='store_true', + help='use BeamableMM in attention layers') + return group + + +def add_model_args(parser): + group = parser.add_argument_group('Model configuration') + group.add_argument('--arch', '-a', default='fconv', metavar='ARCH', + choices=models.__all__, + help='model architecture ({})'.format(', '.join(models.__all__))) + group.add_argument('--encoder-embed-dim', default=512, type=int, metavar='N', + help='encoder embedding dimension') + group.add_argument('--encoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR', + help='encoder layers [(dim, kernel_size), ...]') + group.add_argument('--decoder-embed-dim', default=512, type=int, metavar='N', + help='decoder embedding dimension') + group.add_argument('--decoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR', + help='decoder layers [(dim, kernel_size), ...]') + group.add_argument('--decoder-attention', default='True', type=str, metavar='EXPR', + help='decoder attention [True, ...]') + group.add_argument('--decoder-out-embed-dim', default=256, type=int, metavar='N', + help='decoder output embedding dimension') + return group diff --git a/train.py b/train.py index 6a4114417a..024f3ba783 100644 --- a/train.py +++ b/train.py @@ -8,76 +8,31 @@ import bleu import data import generate -import models +import options import utils from meters import AverageMeter, TimeMeter from multiprocessing_trainer import MultiprocessingTrainer -parser = argparse.ArgumentParser(description='Convolutional Sequence to Sequence Training') -parser.add_argument('data', metavar='DIR', - help='path to data directory') -parser.add_argument('--arch', '-a', default='fconv', metavar='ARCH', - choices=models.__all__, - help='model architecture ({})'.format(', '.join(models.__all__))) - -# dataset and data loading -parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', - help='source language') -parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', - help='target language') -parser.add_argument('--max-tokens', default=6000, type=int, metavar='N', - help='maximum number of tokens in a batch') -parser.add_argument('-j', '--workers', default=1, type=int, metavar='N', - help='number of data loading workers (default: 1)') - -# optimization -parser.add_argument('--lr', '--learning-rate', default=0.25, type=float, metavar='LR', - help='initial learning rate') -parser.add_argument('--min-lr', metavar='LR', default=1e-5, type=float, - help='minimum learning rate') -parser.add_argument('--force-anneal', '--fa', default=0, type=int, metavar='N', - help='force annealing at specified epoch') -parser.add_argument('--lrshrink', default=0.1, type=float, metavar='LS', - help='learning rate shrink factor for annealing, lr_new = (lr * lrshrink)') -parser.add_argument('--momentum', default=0.99, type=float, metavar='M', - help='momentum factor') -parser.add_argument('--clip-norm', default=25, type=float, metavar='NORM', - help='clip threshold 
of gradients') -parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD', - help='weight decay') -parser.add_argument('--dropout', default=0.1, type=float, metavar='D', - help='dropout probability') - -# checkpointing and utilities -parser.add_argument('--save-dir', metavar='DIR', default='checkpoints', - help='path to save checkpoints') -parser.add_argument('--restore-file', default='checkpoint_last.pt', - help='filename in save-dir from which to load checkpoint') -parser.add_argument('--save-interval', type=int, default=-1, - help='checkpoint every this many batches') -parser.add_argument('--no-progress-bar', action='store_true', - help='disable progress bar') -parser.add_argument('--log-interval', type=int, default=1000, metavar='N', - help='log progress every N updates (when progress bar is disabled)') -parser.add_argument('--seed', default=1, type=int, metavar='N', - help='pseudo random number generator seed') - -# model configuration -parser.add_argument('--encoder-embed-dim', default=512, type=int, metavar='N', - help='encoder embedding dimension') -parser.add_argument('--encoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR', - help='encoder layers [(dim, kernel_size), ...]') -parser.add_argument('--decoder-embed-dim', default=512, type=int, metavar='N', - help='decoder embedding dimension') -parser.add_argument('--decoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR', - help='decoder layers [(dim, kernel_size), ...]') -parser.add_argument('--decoder-attention', default='True', type=str, metavar='EXPR', - help='decoder attention [True, ...]') -parser.add_argument('--decoder-out-embed-dim', default=256, type=int, metavar='N', - help='decoder output embedding dimension') -parser.add_argument('--label-smoothing', default=0, type=float, metavar='D', - help='epsilon for label smoothing, 0 means no label smoothing') +parser = options.get_parser('Trainer') + +dataset_args = options.add_dataset_args(parser) +dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N', + help='maximum number of tokens in a batch') +dataset_args.add_argument('--train-subset', default='train', metavar='SPLIT', + choices=['train', 'valid', 'test'], + help='data subset to use for training (train, valid, test)') +dataset_args.add_argument('--valid-subset', default='valid', metavar='SPLIT', + choices=['train', 'valid', 'test'], + help='data subset to use for validation (train, valid, test)') +dataset_args.add_argument('--test-subset', default='test', metavar='SPLIT', + choices=['train', 'valid', 'test'], + help='data subset to use for testing (train, valid, test)') + +options.add_optimization_args(parser) +options.add_checkpoint_args(parser) +options.add_model_args(parser) + def main(): global args @@ -142,10 +97,8 @@ def main(): def train(epoch, batch_offset, trainer, dataset, num_gpus): """Train the model for one epoch""" - itr = dataset.dataloader('train', - num_workers=args.workers, - max_tokens=args.max_tokens, - seed=(args.seed, epoch)) + itr = dataset.dataloader(args.train_subset, num_workers=args.workers, + max_tokens=args.max_tokens, seed=(args.seed, epoch)) loss_meter = AverageMeter() bsz_meter = AverageMeter() # sentences per batch wpb_meter = AverageMeter() # words per batch @@ -207,7 +160,7 @@ def skip_group_enumerator(it, ngpus, offset=0): def validate(epoch, trainer, dataset, ngpus): """Evaluate the model on the validation set and return the average loss""" - itr = dataset.dataloader('valid', batch_size=None, 
max_tokens=args.max_tokens) + itr = dataset.dataloader(args.valid_subset, batch_size=None, max_tokens=args.max_tokens) loss_meter = AverageMeter() desc = '| val {}'.format(epoch) @@ -233,7 +186,7 @@ def score_test(epoch, model, dataset, beam, cuda_device=None): translator.cuda() scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos()) - itr = dataset.dataloader('test', batch_size=4) + itr = dataset.dataloader(args.test_subset, batch_size=4) for id, src, ref, hypos in generate.generate_batched_itr(translator, itr, cuda_device=cuda_device): scorer.add(ref.int().cpu(), hypos[0]['tokens'].int().cpu()) return scorer