refactor runtime of deltann #150

Merged · 2 commits · Oct 24, 2019
8 changes: 4 additions & 4 deletions delta/data/feat/speech_ops.py
@@ -304,8 +304,8 @@ def _loop_body(time, inputs, output_tas):
   loop_vars = (time, waveforms, output_tas)

   parallel_iterations = 10
-  shape_invariants = tf.nest.map_structure(
-      lambda t: tf.TensorShape(None), loop_vars)
+  shape_invariants = tf.nest.map_structure(lambda t: tf.TensorShape(None),
+                                           loop_vars)

   (time, inputs, output_tas) = tf.while_loop(
       _loop_continue,
@@ -362,8 +362,8 @@ def _loop_body(time, end_time, context, left_context, right_context,
   loop_vars = (time, T, context, left_context, right_context, output_tas)

   parallel_iterations = 10
-  shape_invariants = tf.nest.map_structure(
-      lambda t: tf.TensorShape(None), loop_vars)
+  shape_invariants = tf.nest.map_structure(lambda t: tf.TensorShape(None),
+                                           loop_vars)

   (time, end_time, context, left_context, right_context,
    output_tas) = tf.while_loop(
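Both hunks above are formatting-only; the call they rewrap maps tf.TensorShape(None) over every loop variable so that tf.while_loop places no shape constraint between iterations. A minimal standalone sketch of that idiom (TF 1.x graph-mode API assumed; the toy frame-collecting function and its names are illustrative, not from this PR):

import tensorflow as tf


def collect_frames(waveforms, frame_len=160):
  """Toy while_loop whose accumulator grows, hence the unknown-shape invariants."""
  time = tf.constant(0)
  frames = tf.zeros([0, frame_len], dtype=tf.float32)

  def _continue(time, frames):
    return (time + 1) * frame_len <= tf.size(waveforms)

  def _body(time, frames):
    frame = tf.reshape(waveforms[time * frame_len:(time + 1) * frame_len],
                       [1, frame_len])
    return time + 1, tf.concat([frames, frame], axis=0)

  loop_vars = (time, frames)
  # Same idiom as the diff: every loop variable gets a fully unknown shape.
  shape_invariants = tf.nest.map_structure(lambda t: tf.TensorShape(None),
                                           loop_vars)
  _, frames = tf.while_loop(
      _continue,
      _body,
      loop_vars,
      shape_invariants=shape_invariants,
      parallel_iterations=10)
  return frames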
11 changes: 4 additions & 7 deletions delta/data/feat/tf_speech_feature.py
@@ -61,8 +61,7 @@ def compute_mel_filterbank_features(waveforms,
                                     frame_step=10,
                                     fft_length=None,
                                     window_fn=functools.partial(
-                                        tf.signal.hann_window,
-                                        periodic=True),
+                                        tf.signal.hann_window, periodic=True),
                                     lower_edge_hertz=80.0,
                                     upper_edge_hertz=7600.0,
                                     num_mel_bins=80,
@@ -130,11 +129,9 @@ def compute_mel_filterbank_features(waveforms,
   # Warp the linear-scale, magnitude spectrograms into the mel-scale.
   num_spectrogram_bins = magnitude_spectrograms.shape[-1].value
   linear_to_mel_weight_matrix = (
-      tf.signal.linear_to_mel_weight_matrix(num_mel_bins,
-                                            num_spectrogram_bins,
-                                            sample_rate,
-                                            lower_edge_hertz,
-                                            upper_edge_hertz))
+      tf.signal.linear_to_mel_weight_matrix(num_mel_bins, num_spectrogram_bins,
+                                            sample_rate, lower_edge_hertz,
+                                            upper_edge_hertz))
   mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                   linear_to_mel_weight_matrix, 1)
   # Note: Shape inference for tensordot does not currently handle this case.
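This file's two hunks are line-wrapping only; the call being rewrapped is the standard mel-warping step of the filterbank front end. A minimal sketch of that step in isolation (tf.signal API; the helper name and default parameter values below are illustrative):

import tensorflow as tf


def mel_warp(magnitude_spectrograms, sample_rate=16000.0, num_mel_bins=80,
             lower_edge_hertz=80.0, upper_edge_hertz=7600.0):
  """Warp linear-frequency magnitude spectrograms onto the mel scale."""
  num_spectrogram_bins = int(magnitude_spectrograms.shape[-1])
  linear_to_mel_weight_matrix = tf.signal.linear_to_mel_weight_matrix(
      num_mel_bins, num_spectrogram_bins, sample_rate, lower_edge_hertz,
      upper_edge_hertz)
  # Contract the trailing (frequency) axis against the warp matrix.
  mel_spectrograms = tf.tensordot(magnitude_spectrograms,
                                  linear_to_mel_weight_matrix, 1)
  # tensordot does not infer the trailing dimension; set it explicitly.
  mel_spectrograms.set_shape(magnitude_spectrograms.shape[:-1].concatenate(
      linear_to_mel_weight_matrix.shape[-1:]))
  return mel_spectrograms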
3 changes: 2 additions & 1 deletion delta/data/frontend/plp_test.py
@@ -48,7 +48,8 @@ def test_plp(self):
           [0.052763, -0.271487, 0.011329, 0.025320, 0.012851]])

       self.assertEqual(tf.rank(plp_test).eval(), 2)
-      self.assertAllClose(plp_test.eval()[50:55, 5:10], output_true, rtol=1e-05, atol=1e-05)
+      self.assertAllClose(
+          plp_test.eval()[50:55, 5:10], output_true, rtol=1e-05, atol=1e-05)


 if __name__ == '__main__':
18 changes: 10 additions & 8 deletions delta/data/preprocess/base_preparer.py
@@ -121,18 +121,17 @@ def prepare_one_raw_data(self, one_path, one_path_after, mode,
     batch_num = int(math.ceil(data_size / float(self.batch_size)))
     if self.multi_text:
       one_text_after = []
-      for i, one_text in enumerate(text): #to be confirmed
+      for i, one_text in enumerate(text):  #to be confirmed
         one_text_iterator = get_pre_process_text_ds_iter(
             one_text, pre_process_pipeline, self.num_parallel_calls,
             self.batch_size)
-        text_after_arr = self.run_dataset(one_text_iterator,batch_num)
+        text_after_arr = self.run_dataset(one_text_iterator, batch_num)
         text_after = [one_line.decode("utf-8") for one_line in text_after_arr]
         all_texts += text_after
         one_text_after.append(text_after)
     else:
       text = text[0]
-      text_iterator = get_pre_process_text_ds_iter(text,
-                                                   pre_process_pipeline,
+      text_iterator = get_pre_process_text_ds_iter(text, pre_process_pipeline,
                                                    self.num_parallel_calls,
                                                    self.batch_size)
       text_after_arr = self.run_dataset(text_iterator, batch_num)
@@ -147,22 +146,26 @@ def prepare_one_raw_data(self, one_path, one_path_after, mode,
           label_ds = label[i].batch(self.batch_size)
           label_iterator = label_ds.make_initializable_iterator()
           label_after_arr = self.run_dataset(label_iterator, batch_num)
-          label_after_one = [one_line.decode("utf-8") for one_line in label_after_arr]
+          label_after_one = [
+              one_line.decode("utf-8") for one_line in label_after_arr
+          ]
           one_label_after.append(label_after_one)
           all_labels[i] += label_after_one
       else:
         label = label[0]
         label_ds = label.batch(self.batch_size)
         label_iterator = label_ds.make_initializable_iterator()
         label_after_arr = self.run_dataset(label_iterator, batch_num)
-        one_label_after = [one_line.decode("utf-8") for one_line in label_after_arr]
+        one_label_after = [
+            one_line.decode("utf-8") for one_line in label_after_arr
+        ]
         all_labels += one_label_after

     logging.debug(f"one_text_after: {len(one_text_after)}")
     self.save_a_raw_file(one_label_after, one_text_after, one_path_after,
                          infer_without_label)

-  def run_dataset(self, data_iterator,batch_num):
+  def run_dataset(self, data_iterator, batch_num):
     """Run the text pre-process pipeline, fetch data in numpy array format."""
     data_after = []
     data_t = data_iterator.get_next()
@@ -176,7 +179,6 @@ def run_dataset(self, data_iterator,batch_num):
     data_after_arr = np.concatenate(data_after, axis=0)
     return data_after_arr

-
   def load_a_raw_file(self, one_path, infer_without_label):
     """
     Load a raw file. Return text and label.
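The reflowed lines in run_dataset and around it do not change behavior: the method drains an initializable iterator batch by batch and concatenates the per-batch numpy results. A rough standalone equivalent of that pattern (TF 1.x session/iterator API assumed; the helper name and arguments are illustrative):

import numpy as np
import tensorflow as tf


def drain_dataset(dataset, batch_size, batch_num):
  """Pull up to `batch_num` batches from a tf.data pipeline into one numpy array."""
  iterator = dataset.batch(batch_size).make_initializable_iterator()
  next_batch = iterator.get_next()
  collected = []
  with tf.Session() as sess:
    sess.run(iterator.initializer)
    for _ in range(batch_num):
      try:
        collected.append(sess.run(next_batch))
      except tf.errors.OutOfRangeError:
        break
  return np.concatenate(collected, axis=0)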
4 changes: 1 addition & 3 deletions delta/data/preprocess/text_cls_preparer.py
@@ -20,7 +20,6 @@
 from delta.data import utils as data_utils
 from delta.utils.register import registers

-
 # pylint: disable=too-many-instance-attributes


@@ -46,13 +45,12 @@ def load_a_raw_file(self, one_path, infer_without_label):
     ds_list = load_textline_dataset(one_path, column_num)
     if infer_without_label:
       text = ds_list
-      label = [] #to modifiy
+      label = []  #to modifiy
     else:
       text = ds_list[1:]
       label = ds_list[:1]
     return (text, label)

-
   def save_a_raw_file(self, label, text_after, one_path_after,
                       infer_without_label):
     """Save a raw file."""
6 changes: 3 additions & 3 deletions delta/data/preprocess/text_match_preparer.py
@@ -40,9 +40,9 @@ def load_a_raw_file(self, one_path, infer_without_label):
     """

     if infer_without_label:
-      column_num=2
+      column_num = 2
     else:
-      column_num=3
+      column_num = 3

     ds_list = load_textline_dataset([one_path], column_num)
     if infer_without_label:
@@ -52,7 +52,7 @@ def load_a_raw_file(self, one_path, infer_without_label):
       text = ds_list[1:]
       label = ds_list[:1]

-    return (text,label)
+    return (text, label)

   def save_a_raw_file(self, label, text_after, one_path_after,
                       infer_without_label):
68 changes: 33 additions & 35 deletions delta/data/preprocess/text_ops.py
@@ -55,26 +55,23 @@ def tokenize_sentence(texts, max_seq_len, vocab_path):
 def chinese_word_cut_tf(input_str, use_file=False):
   """"""
   main_root = os.environ["MAIN_ROOT"]
-  dict_path = os.path.join(main_root,
-                           "tools/cppjieba/dict/jieba.dict.utf8")
-  hmm_path = os.path.join(main_root,
-                          "tools/cppjieba/dict/hmm_model.utf8")
-  user_dict_path = os.path.join(main_root,
-                                "tools/cppjieba/dict/user.dict.utf8")
+  dict_path = os.path.join(main_root, "tools/cppjieba/dict/jieba.dict.utf8")
+  hmm_path = os.path.join(main_root, "tools/cppjieba/dict/hmm_model.utf8")
+  user_dict_path = os.path.join(main_root, "tools/cppjieba/dict/user.dict.utf8")
   idf_path = os.path.join(main_root, "tools/cppjieba/dict/idf.utf8")
   stop_word_path = os.path.join(main_root,
                                 "tools/cppjieba/dict/stop_words.utf8")

   if use_file:
     output_str = py_x_ops.jieba_cut(
-      input_str,
-      use_file=True,
-      hmm=True,
-      dict_path=dict_path,
-      hmm_path=hmm_path,
-      user_dict_path=user_dict_path,
-      idf_path=idf_path,
-      stop_word_path=stop_word_path)
+        input_str,
+        use_file=True,
+        hmm=True,
+        dict_path=dict_path,
+        hmm_path=hmm_path,
+        user_dict_path=user_dict_path,
+        idf_path=idf_path,
+        stop_word_path=stop_word_path)
   else:
     dict_lines = read_lines_from_text_file(dict_path)
     model_lines = read_lines_from_text_file(hmm_path)
@@ -83,14 +80,14 @@ def chinese_word_cut_tf(input_str, use_file=False):
     stop_word_lines = read_lines_from_text_file(stop_word_path)

     output_str = py_x_ops.jieba_cut(
-      input_str,
-      use_file=False,
-      hmm=True,
-      dict_lines=dict_lines,
-      model_lines=model_lines,
-      user_dict_lines=user_dict_lines,
-      idf_lines=idf_lines,
-      stop_word_lines=stop_word_lines)
+        input_str,
+        use_file=False,
+        hmm=True,
+        dict_lines=dict_lines,
+        model_lines=model_lines,
+        user_dict_lines=user_dict_lines,
+        idf_lines=idf_lines,
+        stop_word_lines=stop_word_lines)
   return output_str


@@ -136,7 +133,8 @@ def char_cut_tf(input_str):
 def load_textline_dataset(paths, column_num):
   """Load raw data for text task."""
   ds = tf.data.TextLineDataset(paths)
-  ds = ds.map(lambda x: tf.strings.split(x, sep="\t", result_type="RaggedTensor"))
+  ds = ds.map(
+      lambda x: tf.strings.split(x, sep="\t", result_type="RaggedTensor"))
   ds = ds.filter(lambda line: tf.equal(tf.size(line), column_num))
   ds_list = []
   for i in range(column_num):
@@ -162,13 +160,13 @@ def process_one_label_dataset(label_ds, config, output_index=None):
   label_vocab_file_path = config["data"]["task"]["label_vocab"]

   label_ds = label_ds.map(
-    lambda x: tokenize_label(
-      x, maxlen=1, label_vocab_file_path=label_vocab_file_path, pad_id=0),
-    num_parallel_calls=num_parallel_calls)
+      lambda x: tokenize_label(
+          x, maxlen=1, label_vocab_file_path=label_vocab_file_path, pad_id=0),
+      num_parallel_calls=num_parallel_calls)

   label_ds = label_ds.map(
-    lambda l: tf.one_hot(l, num_classes, dtype=tf.int32),
-    num_parallel_calls=num_parallel_calls)
+      lambda l: tf.one_hot(l, num_classes, dtype=tf.int32),
+      num_parallel_calls=num_parallel_calls)

   label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)

@@ -185,7 +183,7 @@ def process_multi_label_dataset(label_ds, config, output_index=None):
   label_vocab_file_path = config["data"]["task"]["label_vocab"]
   if isinstance(label_vocab_file_path, list):
     if output_index is None or output_index not in range(
-      len(label_vocab_file_path)):
+        len(label_vocab_file_path)):
       raise IndexError("output_index:{} not in the range of classes length: "
                        "{}!".format(output_index, len(label_vocab_file_path)))
     label_vocab_file_path = label_vocab_file_path[output_index]
@@ -194,12 +192,12 @@ def process_multi_label_dataset(label_ds, config, output_index=None):
     label_vocab_file_path = label_vocab_file_path

   label_ds = label_ds.map(
-    lambda x: tokenize_label(
-      x,
-      maxlen=max_seq_len,
-      label_vocab_file_path=label_vocab_file_path,
-      pad_id=0),
-    num_parallel_calls=num_parallel_calls)
+      lambda x: tokenize_label(
+          x,
+          maxlen=max_seq_len,
+          label_vocab_file_path=label_vocab_file_path,
+          pad_id=0),
+      num_parallel_calls=num_parallel_calls)
   label_ds = label_ds.map(tf.squeeze, num_parallel_calls=num_parallel_calls)

   return label_ds
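The load_textline_dataset hunk only rewraps the map call; the function itself splits each tab-separated line and fans the fields out into one dataset per column. A self-contained sketch of the same pattern (TF 1.x tf.strings.split signature with result_type; helper and file names are illustrative):

import tensorflow as tf


def load_tsv_columns(paths, column_num):
  """Split tab-separated lines into `column_num` per-column datasets."""
  ds = tf.data.TextLineDataset(paths)
  ds = ds.map(
      lambda x: tf.strings.split(x, sep="\t", result_type="RaggedTensor"))
  # Drop malformed lines that do not have exactly `column_num` fields.
  ds = ds.filter(lambda line: tf.equal(tf.size(line), column_num))
  return [ds.map(lambda line, i=i: line[i]) for i in range(column_num)]


# e.g. label_ds, text_ds = load_tsv_columns(["train.after"], column_num=2)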
9 changes: 5 additions & 4 deletions delta/data/preprocess/text_seq2seq_preparer.py
@@ -65,7 +65,8 @@ def prepare_raw_data(self, pre_process_pipeline):
       self.prepare_one_raw_data((one_path_text, one_path_target),
                                 (one_path_text_after, one_path_target_after),
                                 mode, infer_without_label,
-                                pre_process_pipeline, all_texts, all_labels,data_size)
+                                pre_process_pipeline, all_texts, all_labels,
+                                data_size)
     return all_texts, all_labels

   def load_a_raw_file(self, one_path, infer_without_label):
@@ -79,10 +80,10 @@ def load_a_raw_file(self, one_path, infer_without_label):
     column_num = 1
     text_path, target_path = one_path
     texts = load_textline_dataset([text_path], column_num)
-  # texts = data_utils.load_seq2seq_raw_data([text_path])
+    # texts = data_utils.load_seq2seq_raw_data([text_path])
     if not infer_without_label:
-      target = load_textline_dataset([target_path],column_num)
-      return texts+target, target
+      target = load_textline_dataset([target_path], column_num)
+      return texts + target, target
     return texts, []

   def save_a_raw_file(self, label, text_after, one_path_after,
1 change: 0 additions & 1 deletion delta/data/task/base_text_task.py
@@ -57,7 +57,6 @@ def __init__(self, config, mode):
     self.shuffle_buffer_size = self.task_config['shuffle_buffer_size']
     self.need_shuffle = self.task_config['need_shuffle']

-
   def input_fn(self):

     def _input_fn():
1 change: 1 addition & 0 deletions delta/data/task/speaker_cls_task.py
@@ -583,6 +583,7 @@ def make_example(inputs, labels, filenames, clip_ids, soft_labels):
         batch(batch_size, drop_remainder=False).\
         prefetch(tf.data.experimental.AUTOTUNE)

+
 class KaldiDir:

   def __init__(self, kaldi_dir):
9 changes: 5 additions & 4 deletions delta/data/task/text_cls_task.py
@@ -41,7 +41,7 @@ class TextClsTask(TextTask):
   def __init__(self, config, mode):
     super().__init__(config, mode)
     self.infer_no_label = self.config["data"][utils.INFER].get(
-      'infer_no_label', False)
+        'infer_no_label', False)
     self.vocab_min_frequency = self.task_config['vocab_min_frequency']
     self.text_vocab_file_path = self.task_config['text_vocab']
     self.label_vocab_file_path = self.task_config['label_vocab']
@@ -68,7 +68,8 @@ def generate_data(self):
       text_ds = load_textline_dataset(self.paths_after_pre_process, column_num)
     else:
       column_num = 2
-      label_ds, text_ds = load_textline_dataset(self.paths_after_pre_process, column_num)
+      label_ds, text_ds = load_textline_dataset(self.paths_after_pre_process,
+                                                column_num)

     input_pipeline_func = self.get_input_pipeline(for_export=False)

@@ -106,8 +107,8 @@ def generate_data(self):
                                 self.split_token))
       self.config['data']['split_token'] = int(vocab_dict[self.split_token])
     self.config['data']['vocab_size'] = vocab_size
-    self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(self.paths_after_pre_process)
-
+    self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
+        self.paths_after_pre_process)

     return data_set

3 changes: 2 additions & 1 deletion delta/data/task/text_cls_task_test.py
@@ -246,7 +246,8 @@ def test_chinese_word(self):
     shape_op = tf.shape(input_x)

     with self.cached_session(use_gpu=False, force_gpu=False) as sess:
-      res, shape_res = sess.run([input_x, shape_op], feed_dict={input_sentence: ["我很愤怒"]})
+      res, shape_res = sess.run([input_x, shape_op],
+                                feed_dict={input_sentence: ["我很愤怒"]})
       logging.debug(res[0])
       logging.debug(np.shape(res[0]))
       logging.debug(f"shape: {shape_res}")
15 changes: 9 additions & 6 deletions delta/data/task/text_match_task.py
@@ -48,7 +48,7 @@ def __init__(self, config, mode):
         one_path + ".after" for one_path in self.paths
     ]
     self.infer_no_label = self.config["data"][utils.INFER].get(
-      'infer_no_label', False)
+        'infer_no_label', False)
     self.infer_without_label = bool(mode == utils.INFER and self.infer_no_label)

     self.prepare()
@@ -57,11 +57,13 @@ def __init__(self, config, mode):
   def generate_data(self):
     """Generate data for offline training."""
     if self.infer_without_label:
-      column_num=2
-      text_ds_left, text_ds_right = load_textline_dataset(self.paths_after_pre_process, column_num)
+      column_num = 2
+      text_ds_left, text_ds_right = load_textline_dataset(
+          self.paths_after_pre_process, column_num)
     else:
-      column_num=3
-      label,text_ds_left, text_ds_right=load_textline_dataset(self.paths_after_pre_process, column_num)
+      column_num = 3
+      label, text_ds_left, text_ds_right = load_textline_dataset(
+          self.paths_after_pre_process, column_num)

     input_pipeline_func = self.get_input_pipeline(for_export=False)
     text_ds_left = text_ds_left.map(
@@ -86,7 +88,8 @@ def generate_data(self):
     vocab_size = len(vocab_dict)

     self.config['data']['vocab_size'] = vocab_size
-    self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(self.paths_after_pre_process)
+    self.config['data']['{}_data_size'.format(self.mode)] = get_file_len(
+        self.paths_after_pre_process)

     return data_set_left_right, text_len_left_right

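Here too the change is cosmetic; the wrapped call records the dataset size in the config via get_file_len. A hypothetical stand-in that counts lines the same way (the real get_file_len helper in DELTA may be implemented differently):

def count_lines(paths):
  """Hypothetical stand-in for get_file_len: total line count across files."""
  total = 0
  for path in paths:
    with open(path, encoding="utf-8") as fin:
      total += sum(1 for _ in fin)
  return total


# e.g. config['data']['train_data_size'] = count_lines(paths_after_pre_process)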