translation in reverse direction
gorkemozkaya committed Jul 4, 2019
1 parent 422dd09 commit 7f966d0
Showing 4 changed files with 159 additions and 0 deletions.
1 change: 1 addition & 0 deletions generate_data.sh
@@ -1,3 +1,4 @@
#!/usr/bin/env bash
export EN_TR_CORPUS_DIR=/Users/gorkemozkaya/Downloads/nmt_june_2019/en-tr.txt/

PROBLEM=translate_en_tr
20 changes: 20 additions & 0 deletions generate_data_tr_en.sh
@@ -0,0 +1,20 @@
#!/usr/bin/env bash
export EN_TR_CORPUS_DIR=/Users/gorkemozkaya/Downloads/nmt_june_2019/en-tr.txt/

PROBLEM=translate_tr_en
MODEL=transformer
HPARAMS=transformer_base

DATA_DIR=/tmp/t2t/data_tr_en
TMP_DIR=/tmp/tmp_tr_en
TRAIN_DIR=/tmp/t2t/train
USR_DIR=/Users/gorkemozkaya/Projects/NMT/nmt-en-tr/t2t/problems/nmt-en-tr

mkdir -p $DATA_DIR $TMP_DIR $TRAIN_DIR

# Generate data
t2t-datagen \
  --t2t_usr_dir=$USR_DIR \
  --data_dir=$DATA_DIR \
  --tmp_dir=$TMP_DIR \
  --problem=$PROBLEM
24 changes: 24 additions & 0 deletions t2t/decode_interactive.sh
@@ -0,0 +1,24 @@
#!/usr/bin/env bash
OUT_DIR=/Users/gorkemozkaya/Projects/NMT/storage_bucket/train_v2
MODEL=transformer
HPARAMS=transformer_tpu
PROBLEM=translate_en_tr

USR_DIR=/Users/gorkemozkaya/Projects/NMT/nmt-en-tr/t2t/problems/nmt-en-tr
DATA_DIR=/tmp/t2t/data

# Decode interactively with beam search

BEAM_SIZE=4
ALPHA=0.6

t2t-decoder \
  --t2t_usr_dir=$USR_DIR \
  --data_dir=$DATA_DIR \
  --problem=$PROBLEM \
  --model=$MODEL \
  --hparams_set=$HPARAMS \
  --output_dir=$OUT_DIR \
  --decode_hparams="beam_size=$BEAM_SIZE,alpha=$ALPHA" \
  --decode_interactive=True
114 changes: 114 additions & 0 deletions t2t/problems/nmt-en-tr/probdef.py
@@ -168,3 +168,117 @@ def max_subtoken_length(self):
  # END: Overridable methods.


# We inherit from `Text2TextProblem`, which takes care of many details of
# reading and writing the data to disk, which vocabulary type to use, its
# size, etc. -- so we need not worry about them; one can, of course,
# override those.
@registry.register_problem
class TranslateTrEn(text_problems.Text2TextProblem):
  """Translate Turkish to English."""

  # START: Methods we should override.

  # The methods that need to be overridden from `Text2TextProblem` are
  # `is_generate_per_split` and `generate_samples`.

  @property
  def is_generate_per_split(self):
    # If we have pre-existing data splits for (train, eval, test), we set
    # this to True, which will have `generate_samples` called once per
    # dataset split.
    #
    # If we do not have pre-existing data splits, we set this to False, which
    # will have `generate_samples` called just once, and the Problem will
    # automatically partition the data into dataset_splits.
    return False

  def generate_samples(self, data_dir, tmp_dir, dataset_split):

    import re
    re0 = re.compile(r'\w.*|$')

    def preprocess(x):
      """Remove the non-word characters at the beginning of a sentence.

      :param x: input string
      :return: cleaned version of the string
      """
      return re0.search(x).group()
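
    # Illustrative examples (an added note, not part of the original commit):
    # the regex keeps everything from the first word character onward, so
    # leading subtitle markers such as "- " are stripped, and a line with no
    # word characters collapses to the empty string:
    #   preprocess("- Hello there!")  ->  "Hello there!"
    #   preprocess("...")             ->  ""
    # Since '.' does not match newline, the trailing "\n" that each file
    # line carries is dropped as well.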

    en_tr_corpus_dir = os.environ["EN_TR_CORPUS_DIR"]

    # PART 1 - OPEN SUBTITLES CORPUS
    with open(en_tr_corpus_dir + 'OpenSubtitles.en-tr.en') as f_en, open(
        en_tr_corpus_dir + 'OpenSubtitles.en-tr.tr') as f_tr:
      data_iterator = zip(f_tr, f_en)
      t = 0
      for sentence_input, sentence_target in data_iterator:
        t += 1
        source = preprocess(sentence_input)
        target = preprocess(sentence_target)
        if t % 50 == 0:  # downsampling: keep 1 of every 50 sentence pairs
          yield {
              "inputs": source,
              "targets": target,
          }
    # PART 2 - NEWS ARTICLES CORPUS
    with open(en_tr_corpus_dir + 'SETIMES2.en-tr.en') as f_en, open(
        en_tr_corpus_dir + 'SETIMES2.en-tr.tr') as f_tr:
      data_iterator = zip(f_tr, f_en)
      for sentence_input, sentence_target in data_iterator:
        source = preprocess(sentence_input)
        target = preprocess(sentence_target)

        yield {
            "inputs": source,
            "targets": target,
        }

  # END: Methods we should override.

  # START: Overridable methods.

  @property
  def vocab_type(self):
    # We can use different types of vocabularies: `VocabType.CHARACTER`,
    # `VocabType.SUBWORD`, and `VocabType.TOKEN`.
    #
    # SUBWORD and CHARACTER are fully invertible -- and SUBWORD provides a
    # good tradeoff between CHARACTER and TOKEN.
    return text_problems.VocabType.SUBWORD
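
  # For intuition (an added note, not part of the original commit):
  # "fully invertible" means decode(encode(s)) == s for any input string s,
  # which SUBWORD achieves by backing off to smaller subwords and,
  # ultimately, escaped single characters for anything unseen in training.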

  @property
  def approx_vocab_size(self):
    # Approximate vocab size to generate. Only used for VocabType.SUBWORD.
    return 2**14  # ~16k

  @property
  def dataset_splits(self):
    # Since we are responsible for generating the dataset splits, we override
    # `Text2TextProblem.dataset_splits` to specify that we intend to keep
    # 80% of the data for training and 10% each for evaluation and testing.
    return [{
        "split": problem.DatasetSplit.TRAIN,
        "shards": 8,
    }, {
        "split": problem.DatasetSplit.EVAL,
        "shards": 1,
    }, {
        "split": problem.DatasetSplit.TEST,
        "shards": 1,
    }]

  @property
  def max_subtoken_length(self):
    """Maximum subtoken length when generating vocab.

    SubwordTextEncoder vocabulary building is quadratic-time in this
    variable; setting it to None uses the length of the longest token
    in the corpus.

    Returns:
      an integer or None
    """
    return 8

  # END: Overridable methods.
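
Once this module is importable (e.g. via the --t2t_usr_dir flag used in the
scripts above), tensor2tensor's registry can resolve the new problem by the
snake_cased class name. The following is a minimal sketch, not part of the
commit, for pulling a couple of raw samples programmatically; the corpus
path and the direct `probdef` import are placeholders.

import itertools
import os

from tensor2tensor.utils import registry

import probdef  # hypothetical import; registers TranslateTrEn as a side effect

os.environ["EN_TR_CORPUS_DIR"] = "/path/to/en-tr.txt/"  # placeholder path

tr_en = registry.problem("translate_tr_en")  # snake_case of TranslateTrEn
samples = tr_en.generate_samples(data_dir="/tmp/t2t/data_tr_en",
                                 tmp_dir="/tmp/tmp_tr_en",
                                 dataset_split=None)
for sample in itertools.islice(samples, 2):
  print(sample)  # {'inputs': <Turkish sentence>, 'targets': <English sentence>}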
