dsutils.py
import numpy as np
import itertools as itools
from DataSequence import DataSequence

# Non-word annotation tokens that appear in the transcripts and should be excluded
# when building word-level DataSequences.
DEFAULT_BAD_WORDS = frozenset(["sentence_start", "sentence_end", "br", "lg", "ls", "ns", "sp"])

def make_word_ds(grids, trfiles, bad_words=DEFAULT_BAD_WORDS):
    """Creates DataSequence objects containing the words from each grid, with any words
    appearing in the [bad_words] set removed.
    """
    ds = dict()
    stories = grids.keys()
    for st in stories:
        grtranscript = grids[st].tiers[1].make_simple_transcript()
        ## Filter out bad words
        goodtranscript = [x for x in grtranscript
                          if x[2].lower().strip("{}").strip() not in bad_words]
        d = DataSequence.from_grid(goodtranscript, trfiles[st][0])
        ds[st] = d
    return ds
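
# Hypothetical usage sketch (names below are assumptions, not part of this module):
# `grids` is assumed to map story names to parsed TextGrid objects exposing a `tiers`
# list, and `trfiles` to map story names to lists of TR-timing objects, both built
# elsewhere in the pipeline.
#
#     wordseqs = make_word_ds(grids, trfiles)
#     some_story = list(wordseqs.keys())[0]
#     print(wordseqs[some_story].data[:10])   # first ten words of that story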

def make_phoneme_ds(grids, trfiles):
    """Creates DataSequence objects containing the phonemes from each grid.
    """
    ds = dict()
    stories = grids.keys()
    for st in stories:
        grtranscript = grids[st].tiers[0].make_simple_transcript()
        d = DataSequence.from_grid(grtranscript, trfiles[st][0])
        ds[st] = d
    return ds

phonemes = ['AA', 'AE', 'AH', 'AO', 'AW', 'AY', 'B', 'CH', 'D',
            'DH', 'EH', 'ER', 'EY', 'F', 'G', 'HH', 'IH', 'IY', 'JH',
            'K', 'L', 'M', 'N', 'NG', 'OW', 'OY', 'P', 'R', 'S', 'SH',
            'T', 'TH', 'UH', 'UW', 'V', 'W', 'Y', 'Z', 'ZH']

def make_character_ds(grids, trfiles):
    """Creates DataSequence objects containing the character annotations (tier 2) from
    each grid. Each non-empty annotation is parsed as a comma-separated list of integer
    character IDs.
    """
    ds = dict()
    stories = grids.keys()
    for st in stories:
        grtranscript = grids[st].tiers[2].make_simple_transcript()
        fixed_grtranscript = [(s, e, list(map(int, c.split(",")))) for s, e, c in grtranscript if c]
        d = DataSequence.from_grid(fixed_grtranscript, trfiles[st][0])
        ds[st] = d
    return ds

def make_dialogue_ds(grids, trfiles):
    """Creates DataSequence objects containing the dialogue annotations (tier 3) from each grid.
    """
    ds = dict()
    for st, gr in grids.items():
        grtranscript = gr.tiers[3].make_simple_transcript()
        fixed_grtranscript = [(s, e, c) for s, e, c in grtranscript if c]
        ds[st] = DataSequence.from_grid(fixed_grtranscript, trfiles[st][0])
    return ds

def histogram_phonemes(ds, phonemeset=phonemes):
    """Histograms the phonemes in the DataSequence [ds].
    """
    olddata = ds.data
    N = len(ds.data)
    newdata = np.zeros((N, len(phonemeset)))
    # Map each phoneme to its column index (stress digits are stripped before lookup)
    phind = dict((ph, ii) for ii, ph in enumerate(phonemeset))
    for ii, ph in enumerate(olddata):
        try:
            ind = phind[ph.upper().strip("0123456789")]
            newdata[ii, ind] = 1
        except KeyError:
            # Phonemes not found in [phonemeset] are silently skipped
            pass
    return DataSequence(newdata, ds.split_inds, ds.data_times, ds.tr_times)

def histogram_phonemes2(ds, phonemeset=phonemes):
    """Histograms the phonemes in the DataSequence [ds].
    """
    olddata = np.array([ph.upper().strip("0123456789") for ph in ds.data])
    newdata = np.vstack([olddata == ph for ph in phonemeset]).T
    return DataSequence(newdata, ds.split_inds, ds.data_times, ds.tr_times)
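
# Minimal illustration of the indicator-matrix construction used above, with made-up
# toy data (numpy only, no DataSequence needed). In the real pipeline the input would
# come from make_phoneme_ds.
#
#     toy = np.array(["K", "AE1", "T"])
#     stripped = np.array([p.upper().strip("0123456789") for p in toy])
#     indicator = np.vstack([stripped == ph for ph in phonemes]).T
#     # indicator.shape == (3, len(phonemes)); each row contains a single 1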

def make_semantic_model(ds, lsasm):
    """Projects each word in the DataSequence [ds] into the semantic space defined by
    [lsasm]. Words missing from [lsasm] are mapped to zero vectors.
    """
    newdata = []
    for w in ds.data:
        try:
            v = lsasm[w]
        except KeyError:
            v = np.zeros((lsasm.data.shape[0],))
        newdata.append(v)
    return DataSequence(np.array(newdata), ds.split_inds, ds.data_times, ds.tr_times)
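
# Sketch of the interface make_semantic_model assumes of `lsasm` (the real semantic
# model class is defined elsewhere; ToyLSA below is hypothetical): word lookup via
# __getitem__ returning a 1-D vector, and a `data` array whose first axis is the
# embedding dimension.
#
#     class ToyLSA:
#         def __init__(self, vocab, ndim):
#             self.data = np.random.randn(ndim, len(vocab))     # (ndim, nwords)
#             self._index = {w: i for i, w in enumerate(vocab)}
#         def __getitem__(self, word):
#             return self.data[:, self._index[word]]            # KeyError if unknown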

def make_character_model(dss):
    """Make character indicator model for a dict of DataSequences.
    """
    stories = dss.keys()
    storychars = dict([(st, np.unique(np.hstack(ds.data))) for st, ds in dss.items()])
    total_chars = sum(map(len, storychars.values()))
    char_inds = dict()
    ncharsdone = 0
    for st in stories:
        char_inds[st] = dict(zip(storychars[st], range(ncharsdone, ncharsdone + len(storychars[st]))))
        ncharsdone += len(storychars[st])
    charmodels = dict()
    for st, ds in dss.items():
        charmat = np.zeros((len(ds.data), total_chars))
        for ti, charlist in enumerate(ds.data):
            for char in charlist:
                charmat[ti, char_inds[st][char]] = 1
        charmodels[st] = DataSequence(charmat, ds.split_inds, ds.data_times, ds.tr_times)
    return charmodels, char_inds
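
# Hypothetical usage sketch: the character DataSequences from make_character_ds can be
# turned into per-story indicator matrices, with a globally unique column for every
# (story, character) pair (`grids` and `trfiles` are assumed to exist, as above).
#
#     charseqs = make_character_ds(grids, trfiles)
#     charmodels, char_inds = make_character_model(charseqs)
#     # char_inds[story] maps each character ID in that story to its column index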

def make_dialogue_model(ds):
    """Returns a one-column indicator DataSequence marking every dialogue interval in [ds].
    """
    return DataSequence(np.ones((len(ds.data), 1)), ds.split_inds, ds.data_times, ds.tr_times)

def modulate(ds, vec):
    """Multiplies each row (each word/phoneme) by the corresponding value in [vec].
    """
    return DataSequence((ds.data.T * vec).T, ds.split_inds, ds.data_times, ds.tr_times)
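
# Example use of modulate (the weights here are made up): reweight each row of a
# matrix-valued DataSequence, e.g. the output of histogram_phonemes2, by a per-item
# scalar such as a loudness or duration estimate.
#
#     weights = np.random.rand(len(phonhist_ds.data))   # one scalar per phoneme
#     weighted_ds = modulate(phonhist_ds, weights)      # same columns, reweighted rows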

def catmats(*seqs):
    """Horizontally concatenates the data matrices of corresponding DataSequences from
    each dict in [seqs], returning a new dict of DataSequences with the same keys.
    """
    keys = seqs[0].keys()
    return dict([(k, DataSequence(np.hstack([s[k].data for s in seqs]), seqs[0][k].split_inds)) for k in keys])
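
# Hypothetical usage sketch: combine two per-story feature spaces, e.g. a phoneme
# indicator model and a semantic model, into one concatenated matrix per story.
# `phonseqs` and `semseqs` are assumed to be dicts of DataSequences with matching
# keys and row counts.
#
#     combined = catmats(phonseqs, semseqs)
#     # combined[story].data holds the phoneme columns followed by the semantic columns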