[1654c6]: / ispy1 / inferential_statistics.py

Download this file

133 lines (114 with data), 4.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
import pandas as _pd
import numpy as np
from scipy.stats import chi2_contingency as _chi2
from scipy.stats import fisher_exact
import matplotlib as plt
import seaborn as sns
from scipy import stats
from scipy.stats import fisher_exact
from sklearn import preprocessing
from statsmodels.formula.api import ols
import statsmodels.api as sm
import matplotlib.pyplot as plt
# CATEGORICAL PREDICTORS and CATEGORICAL OUTCOMES
def contingency_table(predictor, outcome, dataframe):
'''
contingency_table(predictor, outcome, dataframe)
'''
T = _pd.crosstab(dataframe[predictor], dataframe[outcome]).astype(float);
T = T.loc[['Yes','No']]
T = T[['Yes','No']]
return T
def relative_risk(T):
'''
source: https://www.medcalc.org/calc/relative_risk.php
RR, lb, ub = relative_risk(T)
Estimate the relavite risk (RR), its lower 95%CI (lb), and its upper 95%CI(ub)
'''
a = T[0,0]
b = T[0,1]
c = T[1,0]
d = T[1,1]
SE = np.sqrt( 1/a + 1/c - 1/(a+b) - 1/(c+d) )
p_of_first_row = a / (a+b)
p_of_seconds_row = c / (c+d)
RR = p_of_first_row/p_of_seconds_row
SE = np.sqrt( 1/a + 1/c - (1/(a+b)) - (1/(c+d)) )
CI_95pct_lb = np.exp(np.log(RR) - 1.96 * (SE))
CI_95pct_ub = np.exp(np.log(RR) + 1.96 * (SE))
return np.round(RR,4), np.round(CI_95pct_lb,4), np.round(CI_95pct_ub,4)
def categorical_data(outcome, categorical_predictors, df):
'''
--------
Syntaxis
categorical_data(outcome, categorical_predictors, df)
--------
Inputs:
outcome = string with the categorical outcome to be studied
predictors = list of strings with the categorical predictors
df = Pandas Data Frame with the data
-------
Returns:
Pandas df with the following columns for each predictor
p-value : p-value of chi-squared test
Relative_Risk: Relative Risk (RR) of first row vs second row
RR_lb: Lower bound of the 95% C.I for the RR
RR_ub: Upper bound of the 95% C.I for the RR
'''
#categorical_predictors = df.columns[2:7]
num_pred = len(categorical_predictors)
df2 = _pd.DataFrame(np.random.randn(num_pred, 4))
df2 = df2.set_index([categorical_predictors])
df2.columns = ['p-value', 'Relative_Risk','RR_lb','RR_ub']
for idx, var in enumerate(categorical_predictors):
T = contingency_table(var, outcome, df)
_, p , _, _= _chi2( T.values )
RR, lb, ub = relative_risk(T.values)
df2.iloc[idx,0] = p;
df2.iloc[idx,1] = RR;
df2.iloc[idx,2] = lb;
df2.iloc[idx,3] = ub;
return df2
# Continous PREDICTORS and CATEGORICAL OUTCOMES
def linear_models(df, outcome, predictors, print_results = 1):
# create new dataframe with predictors
df2 = _pd.DataFrame()
df2[predictors] = df[predictors]
# ad outcome to dataframe with predictors encoded as floating
df2[outcome] = preprocessing.LabelEncoder().fit_transform( df[outcome].values )
# create formula from strings
formula = outcome + "~" + "+".join(predictors )
# perform ANOVA
SPY_lm = ols(formula, data = df2 ).fit()
anova = sm.stats.anova_lm(SPY_lm, typ=2) # Type 2 ANOVA DataFrame
if print_results == 1:
print(15*'---')
print(anova)
print(15*'---')
return anova, SPY_lm
def anova_MRI(outcome, df):
mri_predictors= ['MRI_LD_Baseline','MRI_LD_1_3dAC', 'MRI_LD_Int_Reg', 'MRI_LD_PreSurg']
results = results = _pd.DataFrame(np.random.random(size=(len(mri_predictors),1)),
index=mri_predictors, columns=['p-value'])
for idx, pred in enumerate(mri_predictors):
p = list(); p.append(pred)
nova_table, _ = linear_models(df, outcome, p, print_results=0);
results.ix[idx] = nova_table['PR(>F)'].values[0]
f, (ax1, ax2) = plt.subplots(2,2, figsize=(10,10))
sns.boxplot(x= outcome, y=mri_predictors[0], data=df, palette="Set3", ax=ax1[0]).set_title('p-value = '+ str(results.values[0]));
sns.boxplot(x= outcome, y=mri_predictors[1], data=df, palette="Set3", ax=ax2[0]).set_title('p-value = '+ str(results.values[1]));
sns.boxplot(x= outcome, y=mri_predictors[2], data=df, palette="Set3", ax=ax1[1]).set_title('p-value = '+ str(results.values[2]));
sns.boxplot(x= outcome, y=mri_predictors[3], data=df, palette="Set3", ax=ax2[1]).set_title('p-value = '+ str(results.values[3]));
plt.show()
return results
def effect_size(df,predictors, outcome):
all_ = predictors + [outcome]
legend = 'Predictor of ' + outcome
mean_ = df[all_].groupby( outcome ).mean().values
std_ = np.std( df[predictors].values.flatten() )
delta = mean_[0,:] - mean_[1,:]
effect_size = _pd.DataFrame(delta/std_)
effect_size[legend] = predictors
effect_size = effect_size.set_index(legend)
effect_size.columns = ['Effect Size']
return effect_size