-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmix_bthplusbackground.m
201 lines (161 loc) · 7.12 KB
/
mix_bthplusbackground.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
% Start from an empty workspace so variables from earlier runs cannot leak in.
clear

% Make the helper functions in ../utils callable
% (presumably where json2mat, used further down, is defined - confirm).
addpath('../utils');

% --- Signal settings -----------------------------------------------------
fs         = 16000;  % sampling rate (Hz) used for sample arithmetic and for the written wav files
pow_thresh = -20;    % per-channel power relative to the strongest channel (dB);
                     % at or below this value a recording is treated as failed

% --- Input locations (relative paths, resolved against the current folder) -
upath = '../../data/audio/16kHz/isolated/';    % segmented utterances
cpath = '../../data/audio/16kHz/embedded/';    % continuous recordings
bpath = '../../data/audio/16kHz/backgrounds/'; % noise backgrounds
apath = '../../data/annotations/';             % JSON annotations

% --- Output root under which the mixed data folders are created ----------
datapath = '/scratch/mensa/xiali/CHiME3/data/audio/16kHz/NBDF/';

% --- Data partitions ------------------------------------------------------
% Partition tags; the rest of the script indexes per-partition settings in
% this order (training, development/validation, test).
sets = {'tr', 'dt', 'et'};

% Original CHiME3 speaker groups for training, development and test,
% respectively. Each entry is a space-separated list of speaker ids; the
% position of the matching entry selects the '<set>05_bth' folder an
% utterance is read from.
ori_spksets = { ...
    'M01 M02 F02 F03', ...
    'M03 M04 F01 F04', ...
    'M05 F05 M06 F06'};
%% CHiME3 background noise, bth utterances %%
% Background noise: one directory entry per recording, found through its
% channel-1 file (the other channels are addressed later by swapping the
% '.CH1' part of the name).
blist = dir([bpath '*.CH1.wav']);

% bth utterances: read the annotation file of each original CHiME3
% partition with json2mat, then join the three results side by side into
% one list covering every bth utterance.
tr_json = [apath 'tr05_bth.json'];
dt_json = [apath 'dt05_bth.json'];
et_json = [apath 'et05_bth.json'];

bth_tr_mat = json2mat(tr_json);
bth_dt_mat = json2mat(dt_json);
bth_et_mat = json2mat(et_json);

bth_mat = horzcat(bth_tr_mat, bth_dt_mat, bth_et_mat);
%% Speakers and background noise used for train, validation and test of narrow-band deep filtering network %%
% Every list below is a space-separated string. Membership is decided with
% contains(list, token), i.e. by substring match against the whole list.
tr_spks = 'M01 M02 F02 F03'; % Speakers used for training of narrow-band deep filtering network
dt_spks = 'M01 M02 F02 F03'; % Validation uses the same speakers as training
et_spks = 'F01 M03 M04 F04 M05 F05 M06 F06'; % Test uses all remaining speakers
tr_nt = 'BUS CAF PED STR'; % Noise types used for training of narrow-band deep filtering network
dt_nt = 'BUS CAF PED STR';
et_nt = 'BUS CAF PED STR';

% Collect utterances into tr_utt, dt_utt and et_utt for train, validation
% and test, respectively. An utterance can land in several sets when its
% speaker appears in several lists (tr and dt share speakers here).
% The selection is done with one order-preserving mask per set instead of
% eval-built variable names, which also avoids shadowing the builtin `set`.
% Results are row cell arrays, in the same order as bth_mat.
pick_utts = @(spks) reshape( ...
    bth_mat(cellfun(@(u) contains(spks, u.speaker), bth_mat)), 1, []);
tr_utt = pick_utts(tr_spks);
dt_utt = pick_utts(dt_spks);
et_utt = pick_utts(et_spks);

% Similarly, noise sessions are collected into tr_noise, dt_noise and
% et_noise. The noise type is the three characters that precede the
% trailing '.CH1.wav' (8 characters) of each background file name.
noise_tags = arrayfun(@(b) b.name(end-10:end-8), blist, 'UniformOutput', false);
pick_noise = @(nt) reshape( ...
    blist(cellfun(@(t) contains(nt, t), noise_tags)), 1, []);
tr_noise = pick_noise(tr_nt);
dt_noise = pick_noise(dt_nt);
et_noise = pick_noise(et_nt);
%% Generate mixed data
% Microphone channels that are read for every utterance and noise segment,
% and the channel whose clean speech is kept as the reference signal.
CHAN    = [3 4 5 6];
refChan = 6;
nchan   = numel(CHAN);
refIndx = find(CHAN == refChan); % column of the reference channel in CHAN-ordered matrices

% Output folders, one per partition (train, validation, test).
outfolders = {'train_mixed_wav', 'validation_mixed_wav', 'test_mixed_wav'};

% All per-partition vectors/rows below follow the same order as the folders.

% Data augmentation: number of passes over the clean utterances. In each
% pass every utterance is mixed with a freshly drawn random background
% noise segment.
ITE = [15, 3, 5];

% Number of randomly selected clean utterances that are concatenated before
% mixing; the training/validation/test sequences are taken from the
% concatenated signal. The utterances are spoken by different speakers and
% from different locations, so training on concatenated signals takes
% speech/speaker turns into account.
UTTCONC = [10, 10, 1];

% Fraction [start end] of each noise recording a partition may draw from:
% the first 60% is used for train/validation and the last 40% for test.
Npoint = [0   0.6; ...
          0   0.6; ...
          0.6 1];

% Mixing SNRs (dB)
TRAINSNR = [-5, 10]; % Train/validation SNR range; a value is drawn uniformly inside it
TESTSNR  = -4:4:8;   % Test SNRs; one of these values is picked at random
% Generate mixed train/validation/test data.
% For each partition i (order given by `sets`), repeated ITE(i) times:
%   1. shuffle the partition's clean utterances,
%   2. concatenate them in groups of UTTCONC(i), skipping utterances in
%      which any channel is pow_thresh dB or more below the strongest one,
%   3. cut an equally long multichannel segment from a randomly drawn
%      background recording, restricted to the partition's Npoint range,
%   4. scale the speech to a random SNR, add the noise and write the result.

% Per-partition inputs, gathered once so that no eval-built variable names
% are needed inside the loops.
utt_by_set   = {tr_utt, dt_utt, et_utt};
noise_by_set = {tr_noise, dt_noise, et_noise};
% Upper bound on noise redraws per mixture; turns what would otherwise be an
% endless loop (every candidate rejected) into an explicit error.
max_draws = 1000;

for i = 1:numel(sets)
    set_name      = sets{i};   % not called `set`, to keep the builtin usable
    ite           = ITE(i);
    uttconc       = UTTCONC(i);
    noise_session = noise_by_set{i};
    bn            = numel(noise_session);
    outpath       = [datapath outfolders{i}];
    if ~exist(outpath, 'dir')
        % mkdir creates missing parent folders as well and, unlike a shell
        % call, is portable and safe for paths containing spaces.
        [ok, msg] = mkdir(outpath);
        if ~ok
            error('mix_bthplusbackground:mkdir', 'Cannot create %s: %s', outpath, msg);
        end
    end

    for j = 1:ite
        fprintf('%s data: %d/%d \n', set_name, j, ite);
        clean_utt = utt_by_set{i};
        clean_utt = clean_utt(randperm(length(clean_utt))); % shuffle clean utterances

        for utt_indx = 1:uttconc:length(clean_utt)
            % Clean utterances concatenation
            s = [];
            for utt_ind = utt_indx:min(utt_indx+uttconc-1, length(clean_utt))
                utt = clean_utt{utt_ind};
                % Index of the original CHiME3 partition the speaker belongs
                % to; it selects the '<set>05_bth' source folder.
                ori_spkset = find(contains(ori_spksets, utt.speaker));
                oname = [utt.speaker '_' utt.wsj_name '_BTH'];
                su = [];
                for c = 1:nchan
                    su(:,c) = audioread([upath sets{ori_spkset} '05_bth/' oname '.CH' num2str(CHAN(c)) '.wav']);
                end
                % Channel powers in dB relative to the strongest channel
                spow = sum(su.^2, 1);
                spow = 10*log10(spow/max(spow));
                if any(spow <= pow_thresh)
                    continue; % at least one channel is too weak: drop this utterance
                end
                s = [s; su];
            end
            sdur = size(s, 1); % number of samples (rows), independent of the channel count
            if sdur == 0
                continue; % every utterance of this group was dropped
            end

            % Extract background noise
            nfail  = true;
            ndraws = 0;
            while nfail
                ndraws = ndraws + 1;
                if ndraws > max_draws
                    error('mix_bthplusbackground:noiseDraw', ...
                        'No usable %d-sample noise segment found for set "%s" after %d draws.', ...
                        sdur, set_name, max_draws);
                end
                bname = noise_session(randperm(bn,1)).name(1:end-8); % draw a random background recording
                bb    = audioinfo([bpath bname '.CH1.wav']);
                bdur  = bb.TotalSamples; % exact length in samples, the unit audioread indexes in
                setbeg = round(Npoint(i,1)*bdur);
                setend = round(Npoint(i,2)*bdur) - sdur;
                if setend < setbeg
                    % The partition's share of this recording is shorter
                    % than the speech; a segment would leave the share (or
                    % start at a non-positive sample), so redraw.
                    continue;
                end
                nbeg = setbeg + round(rand(1)*(setend-setbeg)) + 1;
                nend = nbeg + sdur - 1;
                n = zeros(sdur, nchan);
                for c = 1:nchan
                    n(:,c) = audioread([bpath bname '.CH' int2str(CHAN(c)) '.wav'], [nbeg nend]);
                end
                % Same relative-power test as for the speech channels
                npow  = sum(n.^2, 1);
                npow  = 10*log10(npow/max(npow));
                nfail = any(npow <= pow_thresh);
            end
            envir = lower(bname(end-2:end)); % noise type tag, lower-cased, used in output names

            % Choose the SNR, then scale the speech so that total speech
            % power over total noise power (all channels) equals it.
            if i <= 2 % training and validation data: uniform in the TRAINSNR range
                snrdb = rand(1)*(TRAINSNR(2)-TRAINSNR(1)) + TRAINSNR(1);
            else      % test data: one of the fixed TESTSNR values
                snrdb = TESTSNR(randperm(length(TESTSNR),1));
            end
            snr = 10^(snrdb/10);
            s = sqrt(snr/sum(sum(s.^2))*sum(sum(n.^2)))*s;
            x = s + n;

            if i <= 2 % training and validation data
                x(:,end+1) = s(:,refIndx); % multichannel noisy signal and reference channel clean speech
                % The name holds only the noise type and the rounded SNR, so
                % two mixtures can map to the same file. Add a counter in
                % that case instead of silently overwriting earlier data.
                base    = [outpath '/' envir '_' num2str(snrdb) 'dB'];
                outfile = [base '.wav'];
                dup = 1;
                while exist(outfile, 'file')
                    dup = dup + 1;
                    outfile = [base '_' num2str(dup) '.wav'];
                end
                audiowrite(outfile, x/max(max(abs(x))), fs);
            else % test data
                udir = [outpath '/' envir '/' 'snr' num2str(snrdb) '/'];
                if ~exist(udir, 'dir')
                    [ok, msg] = mkdir(udir);
                    if ~ok
                        error('mix_bthplusbackground:mkdir', 'Cannot create %s: %s', udir, msg);
                    end
                end
                % oname is the name of the last utterance read for this
                % group (the only one when UTTCONC is 1); its trailing
                % '_BTH' is stripped. Each signal is peak-normalised on its
                % own before writing.
                audiowrite([udir oname(1:end-4) '_ms.wav'], x/max(max(abs(x))), fs);
                audiowrite([udir oname(1:end-4) '_refms.wav'], x(:,refIndx)/max(abs(x(:,refIndx))), fs);
                audiowrite([udir oname(1:end-4) '_cln.wav'], s(:,refIndx)/max(abs(s(:,refIndx))), fs);
            end
        end
    end
end