features_extraction.py
import os
import wave
from glob import glob

import librosa
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from pydub import AudioSegment
from scipy.fftpack import dct
from gammatone.gtgram import gtgram
from spafe.features.bfcc import bfcc
from spafe.utils.preprocessing import SlidingWindow

from constants import (freq, classes, output_folder_name_converter,
                       channels, segment_folder, output_file, skipped_file,
                       y_name, path_name, sec_split, n_ceps, overlap, max_0_perc_allowed)
# Tunable: axis along which each set of cepstral coefficients is averaged (see the compute_* functions below)
mfcc_axis = 1
gfcc_axis = 1
bfcc_axis = 1
sr = freq
def get_wav_duration(file_path):
with wave.open(file_path, 'rb') as wav_file:
frames = wav_file.getnframes()
rate = wav_file.getframerate()
duration = frames / float(rate)
return duration
def compute_0_percentage(y):
total_elements = y.size
zero_count = np.count_nonzero(y == 0)
zero_percentage = zero_count / total_elements
return zero_percentage
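# Illustrative check (assumed values): compute_0_percentage(np.array([0.0, 0.5, 0.0, 1.0]))
# returns 0.5, i.e. half of the samples are exactly zero.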
# Normalization via padding
def load_and_normalize_audio(file_path, sr=44100, max_length=40000):
y, sr = librosa.load(file_path, sr=sr)
len_y = len(y)
print(f"Audio is len {len_y}")
if len_y < max_length:
print("Padding...")
y = np.pad(y, (0, max_length - len(y)), mode='constant')
else:
y = y[:max_length]
return y, sr
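# Hedged usage sketch, not part of the pipeline below: pad or truncate a clip to a
# fixed number of samples when a model expects fixed-size inputs. The file name and
# the 10-second target length are illustrative only.
# y, sr = load_and_normalize_audio("example.wav", sr=freq, max_length=10 * freq)
# assert len(y) == 10 * freq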
# Normalization via segmentation
def load_and_segment_audio(file_path, segment_duration=10, overlap=0.5, sr=44100, pad_shorter=True,
allowed_0_perc=None, check_only_padded=True):
    def add_to_list(segments_list, segment, idx, allowed_0_perc, check_only_padded, is_padded):
        # Apply the zero-percentage filter only when a threshold is given and,
        # if check_only_padded is set, only to segments that were zero-padded.
        must_check = allowed_0_perc is not None and (is_padded or not check_only_padded)
        if must_check:
            perc_of_0 = compute_0_percentage(y=segment)
            if perc_of_0 > allowed_0_perc:
                print(f"Segment {idx} of {file_path} dropped: too many zeros "
                      f"({perc_of_0} vs {allowed_0_perc} allowed)")
                return segments_list
        segments_list.append(segment)
        return segments_list
y, sr = librosa.load(file_path, sr=sr)
audio_duration = get_wav_duration(file_path=file_path)
print(f"Audio {file_path} Is {audio_duration} sec. Len")
is_padded = False
if audio_duration < segment_duration:
if pad_shorter:
print(f"Padding Audio {file_path} To Desired Duration")
y = librosa.util.fix_length(data=y, size=int(segment_duration * sr))
is_padded = True
else:
print(f"Audio {file_path} Too Short, Skipped.")
return None, sr
segment_length = int(segment_duration * sr)
step_length = int(segment_length * (1 - overlap))
segments = []
idx = 0
for start in range(0, len(y), step_length):
end = start + segment_length
if end <= len(y):
segment = y[start:end]
segments = add_to_list(segments_list=segments, segment=segment, idx=idx, allowed_0_perc=allowed_0_perc,
check_only_padded=check_only_padded, is_padded=is_padded)
else:
segment = np.pad(y[start:end], (0, segment_length - len(y[start:end])), mode='constant')
segments = add_to_list(segments_list=segments, segment=segment, idx=idx, allowed_0_perc=allowed_0_perc,
check_only_padded=check_only_padded, is_padded=is_padded)
break
idx += 1
segments = None if len(segments) == 0 else segments
return segments, sr
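# Worked example of the segmentation arithmetic above (illustrative values): with
# segment_duration=10 s, sr=44100 and overlap=0.5,
#   segment_length = 10 * 44100 = 441000 samples
#   step_length    = 441000 * (1 - 0.5) = 220500 samples (a 5 s hop)
# so a 25 s clip yields full segments starting at 0, 5, 10 and 15 s, plus one
# final zero-padded segment starting at 20 s.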
def save_segments_as_mp3(segments, sr, output_dir, base_filename):
os.makedirs(f"{output_dir}/{base_filename}", exist_ok=True)
for i, segment in enumerate(segments):
        # librosa returns float samples in [-1, 1]; convert to 16-bit PCM so that
        # pydub interprets the raw bytes correctly.
        pcm_segment = (np.clip(segment, -1.0, 1.0) * 32767).astype(np.int16)
        audio_segment = AudioSegment(
            pcm_segment.tobytes(),
            frame_rate=sr,
            sample_width=pcm_segment.dtype.itemsize,
            channels=channels
        )
# Save as MP3
output_path = f"{output_dir}/{base_filename}/{base_filename}_segment_{i + 1}.mp3"
audio_segment.export(output_path, format="mp3")
print(f"Saved segment {i + 1} to {output_path}")
def compute_mfcc_features(y, sr):
    '''
    axis=1: each element of the vector is the mean of a single MFCC coefficient across
    the whole signal, giving an overall picture of the MFCC distribution for the clip.
    Output dimension = n_mfcc.
    axis=0: each element of the vector is the mean of all MFCC coefficients for a single
    time frame, describing how the spectral content changes over time.
    Output dimension = n_t (number of frames).
    Tunable parameter.
    '''
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_ceps)
mfcc_mean = np.mean(mfccs, axis=mfcc_axis)
# mfcc_std = np.std(mfccs, axis=0)
cols_mfcc = [f"mfcc_mean_{i}" for i, _ in enumerate(mfcc_mean)]
mfcc_df = pd.DataFrame([mfcc_mean])
mfcc_df.columns = cols_mfcc
return mfcc_df
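# Shape sketch for the axis choice above (assuming a 10 s segment and n_ceps=13):
# librosa.feature.mfcc returns an array of shape (n_mfcc, n_frames); averaging with
# axis=1 collapses the time axis into a single 13-element row per segment, while
# axis=0 would instead produce one value per frame.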
def compute_gfcc_features(y, sr):
    # Method 1
    n_filters = 64   # Number of gammatone filters
    win_size = 0.05  # Window length in seconds
    hop_size = 0.02  # Hop length in seconds
    f_min = 50       # The mel spectrogram shows the energy sits at high frequencies, so this value can be raised with minimal loss
gammatone_spec = gtgram(y, sr, win_size, hop_size, n_filters, f_min)
log_gammatone_spec = np.log(gammatone_spec + 1e-6)
gfccs = dct(log_gammatone_spec, type=2, axis=0, norm='ortho')[:n_ceps]
gfcc_mean = np.mean(gfccs, axis=gfcc_axis)
# gfcc_std = np.std(gfccs, axis=0)
cols_gfcc = [f"gfcc_mean_{i}" for i, _ in enumerate(gfcc_mean)]
gfcc_df = pd.DataFrame([gfcc_mean])
gfcc_df.columns = cols_gfcc
    # Method 2
# gfccs = gfcc(y, fs=sr, num_ceps=13)
# print(gfccs.shape)
return gfcc_df
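# Shape sketch for the gammatone pipeline above: gtgram returns a spectrogram of
# shape (n_filters, n_windows); the DCT along axis=0 followed by [:n_ceps] keeps the
# first n_ceps cepstral coefficients per window, and the mean over axis=1 reduces
# them to a single n_ceps-element vector per segment.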
def compute_bfcc_features(y, sr):
bfccs = bfcc(y, fs=sr, pre_emph=1, pre_emph_coeff=0.97, num_ceps=n_ceps,
window=SlidingWindow(0.03, 0.015, "hamming"), low_freq=0, nfilts=50,
high_freq=16000, normalize="mvn").T
bfcc_mean = np.mean(bfccs, axis=bfcc_axis)
cols_bfcc = [f"bfcc_mean_{i}" for i, _ in enumerate(bfcc_mean)]
bfcc_df = pd.DataFrame([bfcc_mean])
bfcc_df.columns = cols_bfcc
return bfcc_df
# Function to extract features
def extract_features(y, sr):
print("Computing MFCC Features")
mfcc_df = compute_mfcc_features(y=y, sr=sr)
print("MFCC Shape", mfcc_df.shape)
print("---------------------------")
print("Computing GFCC Features")
gfcc_df = compute_gfcc_features(y=y, sr=sr)
print("GFCC Shape", gfcc_df.shape)
print("---------------------------")
print("Computing BFCC Features")
bfcc_df = compute_bfcc_features(y=y, sr=sr)
print("BFCC Shape", bfcc_df.shape)
print("---------------------------")
result = pd.concat([mfcc_df, gfcc_df, bfcc_df], axis=1)
    # Other features that could be added:
'''
chroma = librosa.feature.chroma_stft(y=y, sr=sr)
zero_crossings = librosa.feature.zero_crossing_rate(y)
    # Mean and standard deviation
mfcc_mean = np.mean(mfccs, axis=1)
mfcc_std = np.std(mfccs, axis=1)
chroma_mean = np.mean(chroma, axis=1)
chroma_std = np.std(chroma, axis=1)
zero_crossings_mean = np.mean(zero_crossings)
zero_crossings_std = np.std(zero_crossings)
    # Concatenate all the features
features = np.concatenate([mfcc_mean, mfcc_std, chroma_mean, chroma_std, [zero_crossings_mean, zero_crossings_std]])
return features
'''
return result
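# Resulting row layout: one row per segment containing n_ceps MFCC means, n_ceps
# GFCC means and n_ceps BFCC means side by side, e.g. 39 feature columns when
# n_ceps = 13 (plus the path and label columns added in the loop below).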
def plot_mel_spectrogram(y):
S = librosa.feature.melspectrogram(y=y, sr=sr)
S_dB = librosa.power_to_db(S, ref=np.max)
plt.figure(figsize=(10, 6))
librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
    plt.title('Mel spectrogram')
plt.show()
split_audio = True
process_segments = True
if split_audio:
for file_class in classes:
for audio in glob(f"{output_folder_name_converter}{file_class}/*.wav"):
print(f"Splitting {audio}")
base_name = audio.split("/")[-1].split(".")[0]
segments, _ = load_and_segment_audio(file_path=audio, sr=sr, segment_duration=sec_split,
overlap=overlap, pad_shorter=True, allowed_0_perc=max_0_perc_allowed)
if segments is not None:
save_segments_as_mp3(segments=segments, sr=sr, output_dir=segment_folder + "/" + file_class,
base_filename=base_name)
else:
print(f"Audio {audio} Ignored.")
if process_segments:
dfs = []
skipped = []
errors = []
subfolders = [x[0] for x in os.walk(segment_folder)]
len_sub = len(subfolders)
for i, folder in enumerate(subfolders):
print(f"Processing Folder {folder}. {i + 1}/{len_sub}")
val = folder.split("/")
sample_class = set(val).intersection(set(classes))
files = glob(folder + "/*.mp3")
len_files = len(files)
for j, segment in enumerate(files):
print(f"Processing File {segment}. {j + 1}/{len_files}")
try:
y, sr = librosa.load(segment, sr=sr)
df = extract_features(y=y, sr=sr)
df[path_name] = segment
df[y_name] = list(sample_class)[0]
dfs.append(df)
                # plot_mel_spectrogram(y)
except Exception as e:
skipped.append(segment)
errors.append(e)
final_df = pd.concat(dfs, ignore_index=True)
final_df.to_csv(output_file, index=False)
if len(skipped) > 0:
pd.DataFrame({path_name: skipped, "error": errors}).to_csv(skipped_file, index=False)
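# The final CSV therefore holds one row per processed segment: the feature columns
# from extract_features plus the segment path (path_name) and its class label
# (y_name). Segments that failed to load are listed with their errors in skipped_file.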