-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalysis.py
123 lines (109 loc) · 4.77 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import librosa, numpy as np, scipy.io.wavfile as sio_wav, json
from scipy import signal
def load_wav(file_path, fs) -> np.ndarray:
    """
    Load a wav file and resample it to the requested sampling frequency.

    :param file_path: the path of the wav file (must end with "wav")
    :param fs: sampling frequency (Hz) handed to librosa as the target rate
    :return: array, [-1, 1]
    :raises AssertionError: if the path does not end with "wav" or librosa
        does not return a numpy array
    """
    assert file_path[-3:] == "wav", "[!] Only .wav file can be read.\n{}".format(file_path)
    # The sample rate must be passed by keyword: recent librosa releases made
    # every argument after the path keyword-only, so a positional `fs` raises.
    wav, _ = librosa.core.load(file_path, sr=fs)
    assert isinstance(wav, np.ndarray)
    return wav
def save_wav(file_path, wav, fs, norm=True):
    """
    Save a wav to destination[file_path] as 32-bit float samples.

    The input array is never modified; normalization works on a copy.

    :param file_path: the path of the wav file (must end with "wav")
    :param wav: np_array, shape := (time_step, ...)
    :param fs: sampling frequency (Hz)
    :param norm: if True [default], the wav is scaled so its peak absolute
        value is 1. An all-zero (or empty) signal is written unchanged.
    :return: None
    :raises AssertionError: if the path does not end with "wav"
    """
    assert file_path[-3:] == "wav", "[!] Only .wav type is supported.\n{}".format(file_path)
    samples = np.asarray(wav)
    if norm:
        peak = np.max(np.abs(samples)) if samples.size else 0
        # Skip scaling for silent input: dividing by a zero peak would fill
        # the file with NaN.
        if peak > 0:
            # Out-of-place division: leaves the caller's array untouched and
            # also works for integer inputs, where in-place `/=` raises.
            samples = samples / peak
    sio_wav.write(file_path, fs, samples.astype("float32"))
def load_from_json(file_path, tool_cls):
    """
    Instantiate an analysis tool from the meta information stored in a json file.

    :param file_path: path of the json file holding the tool's meta information
    :param tool_cls: class to instantiate; it is called with the decoded json
        object unpacked as keyword arguments
    :return: an instance of tool_cls
    """
    with open(file_path, "r") as handle:
        raw_text = handle.read()
    # The json document's top-level keys become the constructor's keywords.
    ctor_kwargs = json.loads(raw_text)
    return tool_cls(**ctor_kwargs)
def pre_emphasis(wav, alpha=0.97):
    """
    Apply the first-order FIR filter y[n] = x[n] - alpha * x[n-1].

    :param wav: input signal
    :param alpha: filter coefficient, 0.97 by default
    :return: filtered signal
    """
    numerator = [1, -alpha]
    denominator = [1]
    return signal.lfilter(numerator, denominator, wav)
def de_emphasis(wav, alpha=0.97):
    """
    Apply the first-order IIR filter y[n] = x[n] + alpha * y[n-1].

    :param wav: input signal
    :param alpha: filter coefficient, 0.97 by default
    :return: filtered signal
    """
    numerator = [1]
    denominator = [1, -alpha]
    return signal.lfilter(numerator, denominator, wav)
def get_stft_mag(wav, n_fft, frame_shift_dots, frame_length_dots, window_type="hann", **kargs):
    """
    Compute the magnitude of the short-time Fourier transform of a signal.

    :param wav: input signal
    :param n_fft: FFT size
    :param frame_shift_dots: hop length in samples
    :param frame_length_dots: window length in samples
    :param window_type: window specification forwarded to librosa
    :param kargs: ignored; lets callers unpack a larger meta dict into this call
    :return: magnitude spectrogram, transposed from librosa's output layout
    """
    # Everything after the signal is passed by keyword: recent librosa
    # releases made these parameters keyword-only.
    spec = librosa.core.stft(wav, n_fft=n_fft, hop_length=frame_shift_dots,
                             win_length=frame_length_dots, window=window_type)
    return np.abs(spec).T
def get_mel(stft_m, n_mels=80, sr=22050):
    """
    Convert a magnitude spectrogram into a mel spectrogram.

    :param stft_m: magnitude spectrogram; it is transposed before being
        handed to librosa and the result is transposed back
    :param n_mels: number of mel bands
    :param sr: sampling frequency (Hz) used to build the mel filter bank;
        the default matches librosa's own default, so existing callers see
        no change
    :return: mel spectrogram computed from the squared magnitudes
    """
    power_spec = np.square(stft_m.T)
    mel = librosa.feature.melspectrogram(S=power_spec, sr=sr, n_mels=n_mels)
    return mel.T
class AnalysisToolBase(object):
    """Base class holding the framing/FFT configuration of an analysis tool.

    The configuration lives in ``self.meta`` and can be written to json with
    ``save_as_json``.
    """

    def __init__(self, fs, frame_shift, frame_length, n_fft, window_type, **derived):
        """
        :param fs: sampling frequency (Hz)
        :param frame_shift: frame shift; multiplied by fs to get samples
        :param frame_length: frame length; multiplied by fs to get samples
        :param n_fft: FFT size
        :param window_type: window specification
        :param derived: only ``frame_shift_dots`` / ``frame_length_dots`` are
            accepted, and they are ignored (always recomputed). This lets a
            dict written by ``save_as_json`` be unpacked straight back into
            the constructor.
        :raises TypeError: on any other unexpected keyword argument
        """
        unexpected = sorted(set(derived) - {"frame_shift_dots", "frame_length_dots"})
        if unexpected:
            raise TypeError("__init__() got an unexpected keyword argument {!r}".format(unexpected[0]))
        # int() truncates toward zero when the product is not a whole number.
        frame_shift_dots = int(frame_shift * fs)
        frame_length_dots = int(frame_length * fs)
        self.meta = dict(fs=fs, frame_shift=frame_shift, frame_length=frame_length,
                         frame_shift_dots=frame_shift_dots, frame_length_dots=frame_length_dots,
                         n_fft=n_fft, window_type=window_type)

    def save_as_json(self, file_path):
        """
        Write ``self.meta`` to a json file.

        :param file_path: destination json path
        :return: None
        """
        with open(file_path, "w") as f:
            json.dump(self.meta, f)
class GLA(AnalysisToolBase):
    """Griffin-Lim Vocoder

    Extracts (mel, linear) magnitude spectrograms from wav files and
    synthesizes a waveform from a linear magnitude spectrogram by iterative
    phase estimation.
    """

    def __init__(self, *args, n_mels=80, pre_emphasis_coef=0.97, **kwargs):
        """
        :param args: positional arguments forwarded to AnalysisToolBase
            (fs, frame_shift, frame_length, n_fft, window_type)
        :param n_mels: number of mel bands
        :param pre_emphasis_coef: pre-/de-emphasis filter coefficient
        :param kwargs: the base arguments may also be given by keyword. The
            keys of a saved ``meta`` dict are accepted too, so
            ``GLA(**gla.meta)`` rebuilds an equivalent object:
            ``pre_emphasis`` is taken as ``pre_emphasis_coef`` and the derived
            ``frame_shift_dots`` / ``frame_length_dots`` are ignored.
        :raises TypeError: on any other unexpected keyword argument
        """
        base_keys = ("fs", "frame_shift", "frame_length", "n_fft", "window_type")
        base_kwargs = {key: kwargs.pop(key) for key in base_keys if key in kwargs}
        # Derived values are always recomputed by the base class.
        kwargs.pop("frame_shift_dots", None)
        kwargs.pop("frame_length_dots", None)
        # ``meta`` stores the coefficient under 'pre_emphasis'.
        if "pre_emphasis" in kwargs:
            pre_emphasis_coef = kwargs.pop("pre_emphasis")
        if kwargs:
            raise TypeError(
                "__init__() got an unexpected keyword argument {!r}".format(sorted(kwargs)[0]))
        super(GLA, self).__init__(*args, **base_kwargs)
        self.meta['n_mels'] = n_mels
        self.meta['pre_emphasis'] = pre_emphasis_coef
        self.__mel_filter = librosa.filters.mel(sr=self.meta.get('fs'),
                                                n_fft=self.meta.get('n_fft'),
                                                n_mels=self.meta.get('n_mels'))
        # Transposed so that spectrogram @ filter maps frequency bins to mel bands.
        self.__mel_filter = np.transpose(self.__mel_filter, (1, 0))

    def extract(self, file_path):
        """
        Extract spectrogram and mel_spectrogram.

        The wav is loaded at ``meta['fs']`` and pre-emphasized before the STFT.

        :param file_path: wav file path
        :return: a list := [mel_spectrogram, spectrogram]
        """
        wav = pre_emphasis(load_wav(file_path, self.meta.get('fs')), self.meta.get('pre_emphasis'))
        # get_stft_mag picks the keys it needs out of meta and ignores the rest.
        stft_m = get_stft_mag(wav, **self.meta)
        mel = np.matmul(stft_m, self.__mel_filter)
        return [mel, stft_m]

    def synthesis(self, stft_m, aug_by_power=1.2, max_iter=50, norm=True):
        """
        Synthesize wave from spectrogram

        Starting from uniform random noise, each iteration takes the STFT of
        the current waveform, keeps its phase, substitutes the target
        magnitude and inverts. The result is de-emphasized.

        :param stft_m: spectrogram, shape := (time_step, ...)
        :param aug_by_power: exponent applied to the magnitudes before synthesis
        :param max_iter: number of phase-estimation iterations
        :param norm: if True [default], scale the output so its peak absolute
            value is 1 (skipped for an all-zero output)
        :return: synthesized waveform
        """
        hop = self.meta.get('frame_shift_dots')
        aug_stft_m = np.power(stft_m.T, aug_by_power)
        # After the transpose the frames lie along axis 1; len() would count
        # frequency bins and produce an STFT whose frame count cannot be
        # multiplied with the target magnitudes.
        n_frames = aug_stft_m.shape[1]
        wav_dots = (n_frames - 1) * hop
        wav = np.random.uniform(low=-1., high=1., size=(wav_dots,))
        for _ in range(max_iter):
            spec_complex = self.__stft(wav)
            # Floor the magnitude so zero-valued bins do not turn into NaN.
            phase = spec_complex / np.maximum(1e-8, np.abs(spec_complex))
            wav = self.__inv_stft(aug_stft_m * phase)
        wav = de_emphasis(wav, self.meta.get('pre_emphasis'))
        if norm:
            peak = np.max(np.abs(wav)) if wav.size else 0
            # A silent result has nothing to scale; dividing would give NaN.
            if peak > 0:
                wav = wav / peak
        return wav

    def __stft(self, y):
        """Complex STFT of ``y`` using this tool's framing parameters."""
        return librosa.core.stft(y, n_fft=self.meta.get('n_fft'),
                                 hop_length=self.meta.get('frame_shift_dots'),
                                 win_length=self.meta.get('frame_length_dots'),
                                 window=self.meta.get('window_type'))

    def __inv_stft(self, spec):
        """Inverse STFT of ``spec`` using this tool's framing parameters."""
        return librosa.core.istft(spec, window=self.meta.get('window_type'),
                                  hop_length=self.meta.get('frame_shift_dots'),
                                  win_length=self.meta.get('frame_length_dots'))