forked from markusloecher/DataScience-HWR
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathourFunctions.py
171 lines (124 loc) · 5.11 KB
/
ourFunctions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import numpy as np
import matplotlib as matplt
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy.stats import norm
from scipy import stats
from numpy import random
#random.seed(42)
def ecdf(data):
    """Return the empirical CDF of a one-dimensional sample.

    The result is a pair ``(x, y)``: ``x`` holds the measurements in
    ascending order and ``y`` holds the matching cumulative fractions
    ``1/n, 2/n, ..., 1`` where ``n`` is the number of measurements.
    """
    count = len(data)
    # The k-th smallest observation is paired with the fraction k / count.
    return np.sort(data), np.arange(1, count + 1) / count
def bootstrap_replicate_1d(data, func):
    """Compute one bootstrap replicate of ``func`` for a 1-D sample.

    A resample of the same length as ``data`` is drawn with
    ``np.random.choice`` and ``func`` is evaluated on that resample.
    """
    resampled = np.random.choice(data, size=len(data))
    return func(resampled)
def draw_bs_reps(data, func, size=1):
    """Collect ``size`` bootstrap replicates of ``func`` applied to ``data``.

    Each replicate comes from one call to ``bootstrap_replicate_1d``; the
    values are stored in a float array of length ``size``, which is returned.
    """
    replicates = np.empty(size)
    # Fill the preallocated array in one assignment; the replicates are
    # still generated one after another, in order.
    replicates[:] = [bootstrap_replicate_1d(data, func) for _ in range(size)]
    return replicates
def bs_2sample_test(xA, xB, func, direction="two-sided", size=1000):
    """Bootstrap test for a difference in means between two samples.

    Both samples are shifted so that each has the pooled mean (the null
    hypothesis of equal means). ``size`` bootstrap replicates of the mean
    are drawn from each shifted sample, and the differences of those
    replicates are compared with the observed difference of means.
    The p-value is printed; the replicate differences are returned.

    Parameters
    ----------
    xA, xB : array-like
        The two one-dimensional samples; they may differ in length.
    func : callable
        Currently unused. The test statistic is always the difference of
        means, because the shift applied below only equalises the means.
        The parameter is kept so existing callers continue to work.
    direction : {"two-sided", "left", "right"}
        Alternative hypothesis. "left" counts replicates less than or equal
        to the observed difference, "right" those greater than or equal to
        it, and "two-sided" compares absolute values.
    size : int
        Number of bootstrap replicates to draw per sample.

    Returns
    -------
    numpy.ndarray
        The ``size`` bootstrap differences of means (A minus B) under the
        null hypothesis.

    Raises
    ------
    ValueError
        If ``direction`` is not one of the three supported values.
    """
    # Validate up front: previously an unknown direction left ``p`` unbound
    # and crashed at the print statement after all the resampling was done.
    if direction not in ("two-sided", "left", "right"):
        raise ValueError(
            "direction must be 'two-sided', 'left' or 'right', got %r"
            % (direction,)
        )

    # Accept plain lists as well as arrays / Series.
    xA = np.asarray(xA)
    xB = np.asarray(xB)

    mean_A = np.mean(xA)
    mean_B = np.mean(xB)
    # Pool the observations before averaging; averaging the nested list
    # [xA, xB] only works when both samples have the same length.
    mean_overall = np.mean(np.concatenate((xA, xB)))
    empirical_diff_means = mean_A - mean_B

    # Shift each sample so that both share the pooled mean.
    xA_underNull = xA - mean_A + mean_overall
    xB_underNull = xB - mean_B + mean_overall

    # Bootstrap replicates of the mean from each shifted sample.
    bs_replicates_A = draw_bs_reps(xA_underNull, np.mean, size=size)
    bs_replicates_B = draw_bs_reps(xB_underNull, np.mean, size=size)
    bs_replicates = bs_replicates_A - bs_replicates_B

    # Fraction of replicates at least as extreme as the observed difference.
    if direction == "two-sided":
        p = np.mean(np.abs(bs_replicates) >= np.abs(empirical_diff_means))
    elif direction == "left":
        p = np.mean(bs_replicates <= empirical_diff_means)
    else:  # "right"
        p = np.mean(bs_replicates >= empirical_diff_means)

    print('p-value =', p)
    return bs_replicates
def plot2ECDFs(x1, x2, leg=('male', 'female'), xlab='birth weight(g)', ylab='ECDF', title=''):
    """Draw the ECDFs of two samples in one figure and display it.

    ``x1`` and ``x2`` are each passed through ``ecdf`` (defined elsewhere in
    this module) and plotted as unconnected dots. ``leg`` supplies the two
    legend entries (in the order x1, x2); ``xlab``, ``ylab`` and ``title``
    label the axes and the figure.
    """
    # Compute both curves before touching the figure.
    curves = [ecdf(sample) for sample in (x1, x2)]
    for xs, ys in curves:
        plt.plot(xs, ys, marker='.', linestyle='none')

    # Small margins keep the extreme points off the frame.
    plt.margins(0.02)

    plt.legend(leg, loc='lower right')
    plt.xlabel(xlab)
    plt.ylabel(ylab)
    plt.title(title)

    plt.grid()
    plt.show()
def mean_density_comparison(M=500, n=10, df=None):
    """Simulate sample means of ``dbirwt`` per gender and plot their densities.

    For each of 'male' and 'female', M random samples of size n are drawn
    with ``np.random.choice`` from the ``dbirwt`` values of that gender.
    The mean of every sample is recorded, and a density curve of the M
    means is drawn on the current matplotlib axes.

    Parameters
    ----------
    M : int
        Number of sample means generated per gender.
    n : int
        Number of observations drawn for each sample mean.
    df : pandas.DataFrame, optional
        Data with a ``gender`` column (values 'male' / 'female') and a
        ``dbirwt`` column. When omitted, the module-level name
        ``df_cleaned`` is used, which must then exist in this module.

    Returns
    -------
    pandas.DataFrame
        Columns ``gender`` and ``dbirwt`` (the sample means): M rows per
        gender, 'male' rows first, with index labels 1 .. 2*M.
    """
    if df is None:
        # Backward-compatible fallback to the global this function used to
        # require; passing ``df`` explicitly avoids that hidden dependency.
        df = df_cleaned

    gender_iter = ['male', 'female']
    columns = ['gender', 'dbirwt']

    # Collect the results in plain lists and build the frame once at the
    # end, instead of enlarging a DataFrame one row at a time.
    genders = []
    sample_means = []

    for v in gender_iter:
        subset = df[df.gender == v]
        weights = subset.dbirwt.values

        # M sample means, each from n randomly chosen observations.
        means = [np.mean(np.random.choice(weights, n)) for _ in range(M)]
        genders.extend([v] * M)
        sample_means.extend(means)

        # One density curve per gender on the shared axes.
        pd.DataFrame({'dbirwt': means}).dbirwt.plot.density()

    plt.xlabel('dbirwt')
    plt.legend(gender_iter)

    # Index labels start at 1, matching the labels the row-by-row version
    # produced.
    return pd.DataFrame(
        {'gender': genders, 'dbirwt': sample_means},
        columns=columns,
        index=range(1, len(sample_means) + 1),
    )
# Test the function
# SM40 = mean_density_comparison(M=100, n=40)
# plt.figure()
# SM640 = mean_density_comparison(M=100, n=640)
# Permutation tests
def permutation_sample(data1, data2):
    """Randomly reassign the pooled observations to two groups.

    The two data sets are concatenated and shuffled with
    ``np.random.permutation``. The first ``len(data1)`` shuffled values form
    the first returned sample and the remaining values form the second.
    """
    split_at = len(data1)
    shuffled = np.random.permutation(np.concatenate((data1, data2)))
    return shuffled[:split_at], shuffled[split_at:]
def draw_perm_reps(data_1, data_2, func, size=1):
    """Compute ``size`` permutation replicates of a two-sample statistic.

    For every replicate the two data sets are re-split by
    ``permutation_sample`` and ``func(sample_1, sample_2)`` is evaluated on
    the result. The values are returned in a float array of length ``size``.
    """
    replicates = np.empty(size)
    # Each list element is one fresh permutation followed by the statistic.
    replicates[:] = [
        func(*permutation_sample(data_1, data_2)) for _ in range(size)
    ]
    return replicates
def diff_of_means(data_1, data_2):
    """Return mean(data_1) minus mean(data_2)."""
    mean_1, mean_2 = (np.mean(sample) for sample in (data_1, data_2))
    return mean_1 - mean_2