Factor out and group common CLI args (#34)
Myle Ott authored Jul 29, 2017
1 parent a15a552 commit fb58369
Showing 3 changed files with 145 additions and 132 deletions.
77 changes: 16 additions & 61 deletions generate.py
@@ -9,69 +9,24 @@
import bleu
import data
import models
import options
import utils
from meters import StopwatchMeter, TimeMeter


parser = argparse.ArgumentParser(description='Convolutional Sequence to Sequence Generation')
parser.add_argument('data', metavar='DIR',
help='path to data directory')
parser.add_argument('--path', metavar='FILE', default='./checkpoint_best.pt',
parser = options.get_parser('Generation')
parser.add_argument('--path', metavar='FILE', required=True, default='./checkpoint_best.pt',
help='path to model file')

# dataset and data loading
parser.add_argument('--subset', default='test', metavar='SPLIT',
choices=['train', 'valid', 'test'],
help='data subset to generate (train, valid, test)')
parser.add_argument('--batch-size', default=32, type=int, metavar='N',
help='batch size')

# generation configuration
parser.add_argument('--beam', default=5, type=int, metavar='N',
help='beam size')
parser.add_argument('--nbest', default=1, type=int, metavar='N',
help='number of hypotheses to output')
parser.add_argument('--max-len-a', default=0, type=int, metavar='N',
help=('generate sequence of maximum length ax + b, '
'where x is the source length'))
parser.add_argument('--max-len-b', default=200, type=int, metavar='N',
help=('generate sequence of maximum length ax + b, '
'where x is the source length'))
parser.add_argument('--no-early-stop', action='store_true',
help=('continue searching even after finalizing k=beam '
'hypotheses; this is more correct, but increases '
'generation time by 50%%'))
parser.add_argument('--unnormalized', action='store_true',
help='compare unnormalized hypothesis scores')

# misc
parser.add_argument('--cpu', action='store_true', help='generate on CPU')
parser.add_argument('--beamable-mm', action='store_true',
help='use BeamableMM in attention layers')
parser.add_argument('--no-progress-bar', action='store_true',
help='disable progress bar')

# model configuration
# TODO infer this from model file
parser.add_argument('--arch', '-a', default='fconv', metavar='ARCH',
choices=models.__all__,
help='model architecture ({})'.format(', '.join(models.__all__)))
parser.add_argument('--encoder-embed-dim', default=512, type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-embed-dim', default=512, type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-attention', default='True', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
parser.add_argument('--label-smoothing', default=0, type=float, metavar='D',
help='epsilon for label smoothing, 0 means no label smoothing')
parser.add_argument('--decoder-out-embed-dim', default=256, type=int, metavar='N',
help='decoder output embedding dimension')
dataset_args = options.add_dataset_args(parser)
dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
help='batch size')
dataset_args.add_argument('--gen-subset', default='test', metavar='SPLIT',
choices=['train', 'valid', 'test'],
help='data subset to generate (train, valid, test)')

options.add_generation_args(parser)
options.add_model_args(parser)


def main():
@@ -83,10 +38,10 @@ def main():
progress_bar.enabled = False
use_cuda = torch.cuda.is_available() and not args.cpu

dataset = data.load(args.data)
dataset = data.load(args.data, args.source_lang, args.target_lang)
print('| [{}] dictionary: {} types'.format(dataset.src, len(dataset.src_dict)))
print('| [{}] dictionary: {} types'.format(dataset.dst, len(dataset.dst_dict)))
print('| {} {} {} examples'.format(args.data, args.subset, len(dataset.splits[args.subset])))
print('| {} {} {} examples'.format(args.data, args.gen_subset, len(dataset.splits[args.gen_subset])))

# TODO infer architecture from model file
print('| model {}'.format(args.arch))
@@ -115,7 +70,7 @@ def display_hypotheses(id, src, ref, hypos):

# Generate and compute BLEU score
scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos())
itr = dataset.dataloader(args.subset, batch_size=args.batch_size)
itr = dataset.dataloader(args.gen_subset, batch_size=args.batch_size)
num_sentences = 0
with progress_bar(itr, smoothing=0, leave=False) as t:
wps_meter = TimeMeter()
@@ -131,7 +86,7 @@ def display_hypotheses(id, src, ref, hypos):
num_sentences += 1
print('| Translated {} sentences ({} tokens) in {:.1f}s ({:.2f} tokens/s)'.format(
num_sentences, gen_timer.n, gen_timer.sum, 1. / gen_timer.avg))
print('| Generate {} with beam={}: BLEU4 = {:2.2f}'.format(args.subset, args.beam, scorer.score()))
print('| Generate {} with beam={}: BLEU4 = {:2.2f}'.format(args.gen_subset, args.beam, scorer.score()))


def to_sentence(dict, tokens):
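As a hypothetical sanity check of the regrouped interface (the data directory and language codes below are placeholders, not part of this commit), generate.py would now be invoked along these lines:

    python generate.py data-bin/iwslt14 -s de -t en \
        --path ./checkpoint_best.pt --gen-subset test --beam 5 --batch-size 32

Here the data argument and -s/-t come from options.add_dataset_args, --gen-subset and --batch-size are added locally in generate.py, --beam comes from options.add_generation_args, and --path remains a generate.py-specific flag.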
105 changes: 105 additions & 0 deletions options.py
@@ -0,0 +1,105 @@
import argparse

import models


def get_parser(desc):
parser = argparse.ArgumentParser(
description='Facebook AI Research Sequence-to-Sequence Toolkit -- ' + desc)
parser.add_argument('--no-progress-bar', action='store_true', help='disable progress bar')
parser.add_argument('--log-interval', type=int, default=1000, metavar='N',
help='log progress every N updates (when progress bar is disabled)')
parser.add_argument('--seed', default=1, type=int, metavar='N',
help='pseudo random number generator seed')
return parser


def add_dataset_args(parser):
group = parser.add_argument_group('Dataset and data loading')
group.add_argument('data', metavar='DIR',
help='path to data directory')
group.add_argument('-s', '--source-lang', default=None, metavar='SRC',
help='source language')
group.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
help='target language')
group.add_argument('-j', '--workers', default=1, type=int, metavar='N',
help='number of data loading workers (default: 1)')
return group


def add_optimization_args(parser):
group = parser.add_argument_group('Optimization')
group.add_argument('--lr', '--learning-rate', default=0.25, type=float, metavar='LR',
help='initial learning rate')
group.add_argument('--min-lr', metavar='LR', default=1e-5, type=float,
help='minimum learning rate')
group.add_argument('--force-anneal', '--fa', default=0, type=int, metavar='N',
help='force annealing at specified epoch')
group.add_argument('--lrshrink', default=0.1, type=float, metavar='LS',
help='learning rate shrink factor for annealing, lr_new = (lr * lrshrink)')
group.add_argument('--momentum', default=0.99, type=float, metavar='M',
help='momentum factor')
group.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
help='clip threshold of gradients')
group.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
help='weight decay')
group.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')
group.add_argument('--label-smoothing', default=0, type=float, metavar='D',
help='epsilon for label smoothing, 0 means no label smoothing')
return group


def add_checkpoint_args(parser):
group = parser.add_argument_group('Checkpointing')
group.add_argument('--save-dir', metavar='DIR', default='checkpoints',
help='path to save checkpoints')
group.add_argument('--restore-file', default='checkpoint_last.pt',
help='filename in save-dir from which to load checkpoint')
group.add_argument('--save-interval', type=int, default=-1,
help='checkpoint every this many batches')
return group


def add_generation_args(parser):
group = parser.add_argument_group('Generation')
group.add_argument('--beam', default=5, type=int, metavar='N',
help='beam size')
group.add_argument('--nbest', default=1, type=int, metavar='N',
help='number of hypotheses to output')
group.add_argument('--max-len-a', default=0, type=int, metavar='N',
help=('generate sequence of maximum length ax + b, '
'where x is the source length'))
group.add_argument('--max-len-b', default=200, type=int, metavar='N',
help=('generate sequence of maximum length ax + b, '
'where x is the source length'))
group.add_argument('--no-early-stop', action='store_true',
help=('continue searching even after finalizing k=beam '
'hypotheses; this is more correct, but increases '
'generation time by 50%%'))
group.add_argument('--unnormalized', action='store_true',
help='compare unnormalized hypothesis scores')
group.add_argument('--cpu', action='store_true', help='generate on CPU')
group.add_argument('--beamable-mm', action='store_true',
help='use BeamableMM in attention layers')
return group


def add_model_args(parser):
group = parser.add_argument_group('Model configuration')
group.add_argument('--arch', '-a', default='fconv', metavar='ARCH',
choices=models.__all__,
help='model architecture ({})'.format(', '.join(models.__all__)))
group.add_argument('--encoder-embed-dim', default=512, type=int, metavar='N',
help='encoder embedding dimension')
group.add_argument('--encoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
group.add_argument('--decoder-embed-dim', default=512, type=int, metavar='N',
help='decoder embedding dimension')
group.add_argument('--decoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
group.add_argument('--decoder-attention', default='True', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
group.add_argument('--decoder-out-embed-dim', default=256, type=int, metavar='N',
help='decoder output embedding dimension')
return group
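
The structural point of options.py is that each add_*_args helper creates an argparse argument group and returns it, so a caller can hang script-specific flags on the same group and they are listed under one heading in --help; this is exactly what generate.py above and train.py below do. A minimal, self-contained sketch of that pattern, using illustrative names rather than the real fairseq options:

    import argparse

    def add_dataset_args(parser):
        # Named group: argparse prints these arguments under this heading in --help.
        group = parser.add_argument_group('Dataset and data loading')
        group.add_argument('data', metavar='DIR', help='path to data directory')
        return group

    parser = argparse.ArgumentParser(description='demo')
    dataset_args = add_dataset_args(parser)
    # The caller attaches a script-specific flag to the shared group.
    dataset_args.add_argument('--batch-size', default=32, type=int, metavar='N',
                              help='batch size')
    args = parser.parse_args(['/path/to/data', '--batch-size', '16'])
    print(args.data, args.batch_size)  # -> /path/to/data 16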
95 changes: 24 additions & 71 deletions train.py
@@ -8,76 +8,31 @@
import bleu
import data
import generate
import models
import options
import utils
from meters import AverageMeter, TimeMeter
from multiprocessing_trainer import MultiprocessingTrainer


parser = argparse.ArgumentParser(description='Convolutional Sequence to Sequence Training')
parser.add_argument('data', metavar='DIR',
help='path to data directory')
parser.add_argument('--arch', '-a', default='fconv', metavar='ARCH',
choices=models.__all__,
help='model architecture ({})'.format(', '.join(models.__all__)))

# dataset and data loading
parser.add_argument('-s', '--source-lang', default=None, metavar='SRC',
help='source language')
parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
help='target language')
parser.add_argument('--max-tokens', default=6000, type=int, metavar='N',
help='maximum number of tokens in a batch')
parser.add_argument('-j', '--workers', default=1, type=int, metavar='N',
help='number of data loading workers (default: 1)')

# optimization
parser.add_argument('--lr', '--learning-rate', default=0.25, type=float, metavar='LR',
help='initial learning rate')
parser.add_argument('--min-lr', metavar='LR', default=1e-5, type=float,
help='minimum learning rate')
parser.add_argument('--force-anneal', '--fa', default=0, type=int, metavar='N',
help='force annealing at specified epoch')
parser.add_argument('--lrshrink', default=0.1, type=float, metavar='LS',
help='learning rate shrink factor for annealing, lr_new = (lr * lrshrink)')
parser.add_argument('--momentum', default=0.99, type=float, metavar='M',
help='momentum factor')
parser.add_argument('--clip-norm', default=25, type=float, metavar='NORM',
help='clip threshold of gradients')
parser.add_argument('--weight-decay', '--wd', default=0.0, type=float, metavar='WD',
help='weight decay')
parser.add_argument('--dropout', default=0.1, type=float, metavar='D',
help='dropout probability')

# checkpointing and utilities
parser.add_argument('--save-dir', metavar='DIR', default='checkpoints',
help='path to save checkpoints')
parser.add_argument('--restore-file', default='checkpoint_last.pt',
help='filename in save-dir from which to load checkpoint')
parser.add_argument('--save-interval', type=int, default=-1,
help='checkpoint every this many batches')
parser.add_argument('--no-progress-bar', action='store_true',
help='disable progress bar')
parser.add_argument('--log-interval', type=int, default=1000, metavar='N',
help='log progress every N updates (when progress bar is disabled)')
parser.add_argument('--seed', default=1, type=int, metavar='N',
help='pseudo random number generator seed')

# model configuration
parser.add_argument('--encoder-embed-dim', default=512, type=int, metavar='N',
help='encoder embedding dimension')
parser.add_argument('--encoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='encoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-embed-dim', default=512, type=int, metavar='N',
help='decoder embedding dimension')
parser.add_argument('--decoder-layers', default='[(512, 3)] * 20', type=str, metavar='EXPR',
help='decoder layers [(dim, kernel_size), ...]')
parser.add_argument('--decoder-attention', default='True', type=str, metavar='EXPR',
help='decoder attention [True, ...]')
parser.add_argument('--decoder-out-embed-dim', default=256, type=int, metavar='N',
help='decoder output embedding dimension')
parser.add_argument('--label-smoothing', default=0, type=float, metavar='D',
help='epsilon for label smoothing, 0 means no label smoothing')
parser = options.get_parser('Trainer')

dataset_args = options.add_dataset_args(parser)
dataset_args.add_argument('--max-tokens', default=6000, type=int, metavar='N',
help='maximum number of tokens in a batch')
dataset_args.add_argument('--train-subset', default='train', metavar='SPLIT',
choices=['train', 'valid', 'test'],
help='data subset to use for training (train, valid, test)')
dataset_args.add_argument('--valid-subset', default='valid', metavar='SPLIT',
choices=['train', 'valid', 'test'],
help='data subset to use for validation (train, valid, test)')
dataset_args.add_argument('--test-subset', default='test', metavar='SPLIT',
choices=['train', 'valid', 'test'],
help='data subset to use for testing (train, valid, test)')

options.add_optimization_args(parser)
options.add_checkpoint_args(parser)
options.add_model_args(parser)


def main():
global args
@@ -142,10 +97,8 @@ def main():
def train(epoch, batch_offset, trainer, dataset, num_gpus):
"""Train the model for one epoch"""

itr = dataset.dataloader('train',
num_workers=args.workers,
max_tokens=args.max_tokens,
seed=(args.seed, epoch))
itr = dataset.dataloader(args.train_subset, num_workers=args.workers,
max_tokens=args.max_tokens, seed=(args.seed, epoch))
loss_meter = AverageMeter()
bsz_meter = AverageMeter() # sentences per batch
wpb_meter = AverageMeter() # words per batch
@@ -207,7 +160,7 @@ def skip_group_enumerator(it, ngpus, offset=0):
def validate(epoch, trainer, dataset, ngpus):
"""Evaluate the model on the validation set and return the average loss"""

itr = dataset.dataloader('valid', batch_size=None, max_tokens=args.max_tokens)
itr = dataset.dataloader(args.valid_subset, batch_size=None, max_tokens=args.max_tokens)
loss_meter = AverageMeter()

desc = '| val {}'.format(epoch)
@@ -233,7 +186,7 @@ def score_test(epoch, model, dataset, beam, cuda_device=None):
translator.cuda()

scorer = bleu.Scorer(dataset.dst_dict.pad(), dataset.dst_dict.eos())
itr = dataset.dataloader('test', batch_size=4)
itr = dataset.dataloader(args.test_subset, batch_size=4)
for id, src, ref, hypos in generate.generate_batched_itr(translator, itr, cuda_device=cuda_device):
scorer.add(ref.int().cpu(), hypos[0]['tokens'].int().cpu())
return scorer
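Likewise, a hypothetical train.py invocation with the regrouped flags (the data directory path and hyper-parameter values are placeholders):

    python train.py data-bin/iwslt14 -s de -t en \
        --arch fconv --lr 0.25 --max-tokens 6000 --workers 4 --save-dir checkpoints

Here data, -s/-t and --workers come from options.add_dataset_args, --lr from add_optimization_args, --save-dir from add_checkpoint_args, --arch from add_model_args, and --max-tokens is added locally in train.py.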
