-
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutilsviz.py
84 lines (72 loc) · 3.2 KB
/
utilsviz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/bin/env python
# coding: utf-8
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
nunique = df.nunique()
# For displaying purposes, pick columns that have between 1 and 50 unique values
df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]]
nRow, nCol = df.shape
columnNames = list(df)
nGraphRow = (nCol + nGraphPerRow - 1) / nGraphPerRow
plt.figure(num=None, figsize=(6 * nGraphPerRow, 8 * nGraphRow),
dpi=80, facecolor='w', edgecolor='k')
for i in range(min(nCol, nGraphShown)):
plt.subplot(nGraphRow, nGraphPerRow, i + 1)
columnDf = df.iloc[:, i]
if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
valueCounts = columnDf.value_counts()
valueCounts.plot.bar()
else:
columnDf.hist()
plt.ylabel('counts')
plt.xticks(rotation=90)
plt.title(f'{columnNames[i]} (column {i})')
plt.tight_layout(pad=1.0, w_pad=1.0, h_pad=1.0)
plt.show()
# In[4]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
filename = ""
df = df.dropna('columns') # drop columns with NaN
# keep columns where there are more than 1 unique values
df = df[[col for col in df if df[col].nunique() > 1]]
if df.shape[1] < 2:
print(
f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
return
corr = df.corr()
plt.figure(num=None, figsize=(graphWidth, graphWidth),
dpi=80, facecolor='w', edgecolor='k')
corrMat = plt.matshow(corr, fignum=1)
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.columns)), corr.columns)
plt.gca().xaxis.tick_bottom()
plt.colorbar(corrMat)
plt.title(f'Correlation Matrix for {filename}', fontsize=15)
plt.show()
# In[5]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
df = df.select_dtypes(include=[np.number]) # keep only numerical columns
# Remove rows and columns that would lead to df being singular
df = df.dropna('columns')
# keep columns where there are more than 1 unique values
df = df[[col for col in df if df[col].nunique() > 1]]
columnNames = list(df)
if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
columnNames = columnNames[:10]
df = df[columnNames]
ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[
plotSize, plotSize], diagonal='kde')
corrs = df.corr().values
for i, j in zip(*plt.np.triu_indices_from(ax, k=1)):
ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2),
xycoords='axes fraction', ha='center', va='center', size=textSize)
plt.suptitle('Scatter and Density Plot')
plt.show()