-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdatasource_analysis.py
139 lines (116 loc) · 5.84 KB
/
datasource_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# External imports
import os
import json
import pandas as pd
# Project level imports
import transcription
import config
# Get all files from a directory and subdirectories, or all files of a specified type (eg: ".wav")
# returns a list of file paths
# def getAllFiles(rootSearchPath, specificExt):
# filePaths = []
# # Search dirs and subdirs
# for root, dirs, files in os.walk(rootSearchPath):
# for file in files:
# if specificExt == "":
# filePaths.append(os.path.join(root, file))
# elif file.endswith(specificExt):
# filePaths.append(os.path.join(root, file))
#
# print("Found", len(filePaths), "files in", rootSearchPath)
# return filePaths
# Get the _KEYS.json file containing the identifier structure
def getKeyIdentifiers(path, datasourceName):
    """Load the identifier structure from the datasource's *_KEYS.json file.

    Args:
        path: Directory that contains the keys file.
        datasourceName: Name of the datasource; the file that is read is
            ``<path>/<datasourceName>_KEYS.json``.

    Returns:
        The parsed JSON content, or "" when the file does not exist or cannot
        be decoded/parsed (an error message is printed in both cases).
    """
    keyFilePath = os.path.join(path, datasourceName + "_KEYS.json")
    try:
        # Read explicitly as UTF-8 (tolerating a BOM) rather than relying on
        # the platform's default encoding, which differs between systems.
        with open(keyFilePath, "r", encoding="utf-8-sig") as keyFile:
            return json.load(keyFile)
    except FileNotFoundError as e:
        print("ERROR - Please ensure the *_KEYS.json file exists.")
        print(e)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        # Undecodable bytes are reported the same way as malformed JSON
        print("ERROR - Please ensure the *_KEYS.json file is structured correctly.")
        print(e)
    return ""
# Utilising the identifier-value pairs, label the files accordingly
def labelAllAudioFiles(identifierDict, rootSearchPath, specificExt):
    """Label every matching file under a directory tree from its filename.

    Args:
        identifierDict: Identifier structure. An optional "delimiter" entry
            gives the string used to split filenames into components. Every
            other entry is a category whose value provides "index", "size"
            and "keyValue" (identifier -> label). A "statements" entry equal
            to "" means the statement is obtained by transcribing the file.
            The dict is not modified.
        rootSearchPath: Directory searched recursively for files.
        specificExt: Only files ending with this string are labelled; "" means
            every file.

    Returns:
        A dict mapping each file path to a dict of category -> label. A
        category whose identifier could not be matched keeps the value None.
    """
    # Work on a copy without the "delimiter" entry so the caller's dict is left intact
    delimiter = identifierDict.get("delimiter", "")
    categories = {k: v for k, v in identifierDict.items() if k != "delimiter"}

    # Identify if statements are given, else use transcription
    hasStatements = False
    if "statements" in categories:
        if categories["statements"] != "":
            hasStatements = True
        else:
            print("Warning - No statements given, using transcription. This may take some time.")

    # Store the file info as a dict, path as key
    result = {}
    for root, _dirs, files in os.walk(rootSearchPath):
        for file in files:
            if specificExt != "" and not file.endswith(specificExt):
                continue

            filePath = os.path.join(root, file)
            # Without a delimiter the identifiers are located by character offset,
            # with one they are located by component position
            stem = os.path.splitext(file)[0]
            filenameComponents = stem.split(delimiter) if delimiter else stem

            # For each of the file identifier categories (emotion, gender, modality, etc)
            fileInfo = dict.fromkeys(categories)
            for k, spec in categories.items():
                # When no statements are given transcribe
                if k == "statements" and not hasStatements:
                    print("Transcribing Speech From File:", file)
                    fileInfo[k] = transcription.transcribe(filePath)
                    continue

                keyValue = spec["keyValue"]
                start = spec["index"]
                fileIdValue = filenameComponents[start:start + spec["size"]]
                if delimiter:
                    # A slice of the split filename is a list, which cannot be
                    # looked up in a dict; rebuild the identifier string first
                    fileIdValue = delimiter.join(fileIdValue)

                # Compare the actual file identifier with the values as defined in *_KEYS.json
                if fileIdValue in keyValue:
                    fileInfo[k] = keyValue[fileIdValue]
                else:
                    print("WARNING - Unable to determine filename identifier (", fileIdValue, ") for file:", file)

            # Store the file path, and the file info dict
            result[filePath] = fileInfo
    return result
# Uses a datasource name and file extension to locate the data, and labels each data file using keys from an external file
# returns a pandas dataframe of file paths and their corresponding labels or ""
def identifyData(dataName, dataType, fileExt):
    """Locate a datasource and load or label its data.

    Args:
        dataName: Name of the datasource; its files are looked up under
            ``config.cfg["datasource_path"]/<dataName>``.
        dataType: "AUDIO" (multiple files labelled from their filenames via
            the *_KEYS.json file) or "TEXT" (a single ``<dataName>_DATA`` file
            loaded with pandas).
        fileExt: File extension. For "AUDIO" it filters the files that get
            labelled; for "TEXT" it must be ".csv" or ".xlsx" (any case).

    Returns:
        A pandas DataFrame, or "" when nothing could be loaded or labelled.

    Raises:
        SystemExit: With code -1 when dataType is neither "AUDIO" nor "TEXT".
    """
    # Store the basepath of the datasource in question
    basepath = os.path.join(config.cfg["datasource_path"], dataName)
    # Display a notice to the user
    print("Starting data identification process\nType:", dataType, "\nLocation:", basepath,"\n")

    # Identify the type of data (Audio - Multiple Files, or Text - Single File)
    if dataType not in ("AUDIO", "TEXT"):
        print("ERROR - Please ensure the type is either 'AUDIO' or 'TEXT'")
        # Same effect as exit(-1) without depending on the site-provided helper
        raise SystemExit(-1)

    # Process the audio files - labelling
    if dataType == "AUDIO":
        identifierDict = getKeyIdentifiers(basepath, dataName)
        # getKeyIdentifiers returns "" on failure; labelling needs a dict
        if not isinstance(identifierDict, dict):
            print("ERROR - Unable to load the identifier keys, no files were labelled")
            return ""

        dataFiles = labelAllAudioFiles(identifierDict, os.path.join(basepath, dataName + "_DATA"), fileExt)
        # If no data files were found return an empty string
        if not dataFiles:
            print("ERROR - No files were returned from the labelling process")
            return ""
        print(len(dataFiles), "files were returned from the labelling process")
        return pd.DataFrame.from_dict(dataFiles, orient='index')

    # Text data - load the single data file with pandas
    extension = fileExt.lower()
    dataFilePath = os.path.join(basepath, dataName + "_DATA" + fileExt)
    try:
        if extension == '.csv':
            return pd.read_csv(dataFilePath)
        if extension == '.xlsx':
            return pd.read_excel(dataFilePath)
        print("Error - Unsupported file extension ("+fileExt+")\n")
    except FileNotFoundError as e:
        print("Error - No file was found\n", e)
    # Failure paths return "" explicitly, like the audio branch
    return ""
# print(identifyData("RAVDESS", "AUDIO", ".wav"))