basic_metadata.py
import sys
import time
from typing import Union, Callable, Dict, Iterable

from ds_reader import datasets_to_dataframes
from pyspark.sql import DataFrame, Column, Row
from pyspark.sql.functions import (
    countDistinct, create_map, lit, col, explode, count, when, isnan,
    isnull, sum, collect_list, udf)
from pyspark.sql.types import ArrayType, StringType

# alias the PySpark col() function so it stays reachable where a local
# variable named `col` shadows it (see map_cols)
spark_col = col

OUTPUT_KEY = 'columns'  # key mapped to a dataset which holds the column output (like column metadata)
def map_cols(df: DataFrame) -> DataFrame:
    """
    Maps column names to column values using Spark's MapType: a DataFrame of
    Spark maps is built where each row holds a map like
    {column_name: column_value, ...} with one entry per dataset column.
    That output is then exploded (flattened) so that each row maps one
    column_name to one column_value, i.e. [col1: val1], [col1: val2], ...
    A DataFrame with the columns "key" and "value" is returned.
    """
    # build one map per input row: {col_name_1: value_1, col_name_2: value_2, ...}
    df_mapped_cols = df.select(
        create_map(
            *[item
              for pair in [[lit(col), spark_col(col)] for col in df.columns]
              for item in pair]))
    # explode each map into (key, value) rows: one row per column per input row
    df_exploded_cols = df_mapped_cols.select(
        explode(*df_mapped_cols.columns))
    return df_exploded_cols
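
# Illustrative sketch of the map_cols transformation (the DataFrame and its
# values below are hypothetical; "key"/"value" are the column names Spark
# assigns when exploding a map):
#
#   input df                   map_cols(df)
#   +------+-----+             +------+-------+
#   | name | age |             | key  | value |
#   +------+-----+      ->     +------+-------+
#   | Ann  | 34  |             | name | Ann   |
#   | Bo   | 27  |             | age  | 34    |
#   +------+-----+             | name | Bo    |
#                              | age  | 27    |
#                              +------+-------+
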
def reduce_cols(df_cols: Union[DataFrame, Column],
                agg_name: str, aggFunc: Callable,
                other_groupBy_keys: Iterable[Column] = ()) -> DataFrame:
    """
    Accepts map_cols output (flattened key/value rows) and groups it by column
    name, aggregating the values with aggFunc.
    agg_name is later used in append_output as the key under which the outputs
    are added to the output dict.
    other_groupBy_keys is used in cases such as getting the top 5 frequent
    values, where the value itself must also be part of the grouping.
    """
    return df_cols.groupBy(
        spark_col("key").alias("col_name"), *other_groupBy_keys).agg(
        aggFunc("value").alias(agg_name))
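
# Illustrative usage sketch (the aggregate name 'number_cells' is hypothetical
# here, mirroring the commented-out check in test() below): with the exploded
# (key, value) rows from map_cols, a plain per-column count could be computed as
#
#   reduce_cols(df_cols, 'number_cells', count)
#
# giving one row per dataset column, e.g. Row(col_name='age', number_cells=2).
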
def append_output(ds_dct: Dict, result_key: str, df_output: DataFrame):
    """
    Adds result_key (a column name in df_output) to ds_dct (the output dict
    for a dataset), mapping it under the appropriate dataset column name.
    Used for the output of reduce_cols.
    """
    output_rows = df_output.collect()
    for row in output_rows:
        col_name = row['col_name']
        if col_name not in ds_dct[OUTPUT_KEY]:
            ds_dct[OUTPUT_KEY][col_name] = {'column_name': col_name}
        # if result_key is already in the dict, turn the value into a list and
        # append to it (e.g. for frequent values)
        if result_key in ds_dct[OUTPUT_KEY][col_name]:
            if not isinstance(ds_dct[OUTPUT_KEY][col_name][result_key], list):
                ds_dct[OUTPUT_KEY][col_name][result_key] = [
                    ds_dct[OUTPUT_KEY][col_name][result_key]]
            ds_dct[OUTPUT_KEY][col_name][result_key].append(row[result_key])
        # otherwise simply set result_key = value
        else:
            ds_dct[OUTPUT_KEY][col_name][result_key] = row[result_key]
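
# Sketch of the effect (the row below is hypothetical): if df_output contains
# Row(col_name='age', number_distinct_values=2), then after
# append_output(ds_dct, 'number_distinct_values', df_output) the dict holds
#
#   ds_dct['columns']['age'] == {'column_name': 'age',
#                                'number_distinct_values': 2}
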
def get_count_distincts(df_cols: Union[DataFrame, Column], ds_dct: Dict):
df_output = reduce_cols(df_cols, 'number_distinct_values',
countDistinct)
append_output(ds_dct, 'number_distinct_values', df_output)
def get_top_5(df_cols: Union[DataFrame, Column], ds_dct: Dict):
    # count occurrences of each (column, value) pair
    df_output = reduce_cols(df_cols, 'count', count, other_groupBy_keys=[
        spark_col('value').alias("frequent_values")])
    counts = df_output.sort('count', ascending=False)  # sort only on count for speed
    # counts.sort('col_name', 'count', ascending=False).show()  # DEBUG
    # collect the values per column; note that this relies on the sort order
    # surviving the groupBy, which Spark does not strictly guarantee
    df_output = counts.groupBy('col_name').agg(
        collect_list('frequent_values').alias('frequent_values'))
    # keep only the first 5 collected values per column, cast to strings
    top_5 = udf(lambda lst: [str(val) for val in lst[:5]],
                ArrayType(StringType()))
    df_output = df_output.select('col_name', top_5(
        'frequent_values').alias('frequent_values'))
    append_output(ds_dct, 'frequent_values', df_output)
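
# Sketch of the per-column rows appended above (values hypothetical):
#   Row(col_name='age', frequent_values=['34', '27', ...])
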
def get_non_empty(df_cols: Union[DataFrame, Column], ds_dct: Dict):
    # a cell is non-empty when its exploded value is neither NaN nor NULL
    df_output = reduce_cols(df_cols, 'number_non_empty_cells',
                            lambda c: count(when(~(isnan(c) | isnull(c)), c)))
    append_output(ds_dct, 'number_non_empty_cells', df_output)
def get_empty(df_cols: Union[DataFrame, Column], ds_dct: Dict):
    # a cell is empty when its exploded value is NaN or NULL; counting lit(1)
    # rather than the value itself ensures NULL cells are included, since
    # count() skips NULLs
    df_output = reduce_cols(df_cols, 'number_empty_cells',
                            lambda c: count(when(isnan(c) | isnull(c), lit(1))))
    append_output(ds_dct, 'number_empty_cells', df_output)
def get_basic_metadata(df_cols: Union[DataFrame, Column], ds_dct: Dict):
    """
    driver function that updates ds_dct with the following:
    {
        "columns": {
            "<column_name>": {
                "column_name": <string>,
                "number_distinct_values": <int>,
                "frequent_values": [
                    <string1>,
                    <string2>,
                    <string3>,
                    <string4>,
                    <string5>
                ],
                "number_non_empty_cells": <int>,
                "number_empty_cells": <int>,
                ...
            }
        }
    }
    """
    get_non_empty(df_cols, ds_dct)
    get_empty(df_cols, ds_dct)
    get_count_distincts(df_cols, ds_dct)
    get_top_5(df_cols, ds_dct)
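
# Minimal usage sketch, mirroring _run() in test() below (assumes `df` is a
# Spark DataFrame from ds_reader that carries a `ds_name` attribute):
#
#   dct = {'dataset_name': df.ds_name, OUTPUT_KEY: {}}
#   get_basic_metadata(map_cols(df), dct)
#   # dct[OUTPUT_KEY] now maps every column name to its metadata dict
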
def get_val_from_single_val_col(df: DataFrame):
    """Extracts the single value from a one-row, one-column DataFrame."""
    return list(df.collect()[0].asDict().values())[0]
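
# e.g. get_val_from_single_val_col(df.agg(countDistinct('age'))) would return
# the distinct count of a (hypothetical) 'age' column as a plain Python value
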
def __test_compare(dfs):
    """
    Compares the map/explode Spark implementation of count_distinct to a
    naive one that runs a separate aggregation for each column.
    """
    limit = 20
for i, df in enumerate(dfs):
start = time.time()
ds_dct = {
'dataset': df.ds_name,
OUTPUT_KEY: {}
}
df_cols = map_cols(df)
get_count_distincts(df_cols, ds_dct)
end = time.time()
print(end-start)
print(ds_dct)
###
start = time.time()
ds_dct = {
'dataset': df.ds_name,
OUTPUT_KEY: {}
}
        for col_name in df.columns:
            num_distincts = get_val_from_single_val_col(
                df.agg(countDistinct(col_name)))
            if col_name not in ds_dct[OUTPUT_KEY]:
                ds_dct[OUTPUT_KEY][col_name] = {'column_name': col_name}
            ds_dct[OUTPUT_KEY][col_name]['number_distinct_values'] = num_distincts
        end = time.time()
        print(end - start)
        print(ds_dct)
        if i >= limit - 1:
            sys.exit(0)
def test():
"""
test_timing with standardized interface in timing module
"""
from timing import timed
# master_dct contains all output to be used in json
master_dct = {}
def _run(df, i):
### MAIN ###
# dct belongs to a dataset
dct = {
'dataset_name': df.ds_name,
OUTPUT_KEY: {}
}
df_cols = map_cols(df)
get_basic_metadata(df_cols, dct) # main driver
### test num_cells (counts) ###
# rows = reduce_cols(df_cols, 'number_cells', lambda c: sum(lit(1)))
# append_output(dct, 'number_cells', rows)
# for col in dct[OUTPUT_KEY].keys():
# if dct[OUTPUT_KEY][col]['number_cells'] != dct[OUTPUT_KEY][col]['number_non_empty_cells'] + dct[OUTPUT_KEY][col]['number_empty_cells']:
# raise ValueError('Error in dataset: {}. Column: {}'.format(
# dct.keys()[0], str(dct[OUTPUT_KEY][col])))
# update master_dct with dataset
master_dct.update({dct['dataset_name']: dct})
return dct
timed(_run)
# DEBUG print all output
# print()
# print("### MASTER_DCT###")
# print(master_dct)
if __name__ == '__main__':
# use cli as "ds_reader.py <hdfs dir> <limit>" where limit is an optional int
test()