Switch to side-by-side view

--- a
+++ b/ispy1/inferential_statistics.py
@@ -0,0 +1,132 @@
+import pandas as _pd
+import numpy as np
+from scipy.stats import chi2_contingency as _chi2
+from scipy.stats import fisher_exact
+import matplotlib as plt
+import seaborn as sns
+from scipy import stats
+from scipy.stats import fisher_exact
+from sklearn import preprocessing
+from statsmodels.formula.api import ols
+import statsmodels.api as sm
+import matplotlib.pyplot as plt
+
+# CATEGORICAL PREDICTORS and CATEGORICAL OUTCOMES
+def contingency_table(predictor, outcome, dataframe):
+    '''
+    contingency_table(predictor, outcome, dataframe)
+    '''
+    T = _pd.crosstab(dataframe[predictor], dataframe[outcome]).astype(float);
+    T = T.loc[['Yes','No']]
+    T = T[['Yes','No']]
+    return T
+
+def relative_risk(T):
+    '''
+    source: https://www.medcalc.org/calc/relative_risk.php
+    RR, lb, ub  = relative_risk(T)
+    Estimate the relavite risk (RR), its lower 95%CI (lb), and its upper 95%CI(ub)
+    '''
+    a = T[0,0]
+    b = T[0,1]
+    c = T[1,0]
+    d = T[1,1]
+    SE = np.sqrt( 1/a + 1/c - 1/(a+b) - 1/(c+d) )
+
+    p_of_first_row = a / (a+b)
+    p_of_seconds_row = c / (c+d)
+    RR = p_of_first_row/p_of_seconds_row
+
+    SE = np.sqrt( 1/a + 1/c - (1/(a+b)) - (1/(c+d)) )
+    CI_95pct_lb = np.exp(np.log(RR) - 1.96 * (SE))
+    CI_95pct_ub = np.exp(np.log(RR) + 1.96 * (SE))
+
+    return np.round(RR,4), np.round(CI_95pct_lb,4), np.round(CI_95pct_ub,4)
+
+def categorical_data(outcome, categorical_predictors, df):
+    '''
+    --------
+    Syntaxis
+    categorical_data(outcome, categorical_predictors, df)
+    --------
+    Inputs:
+    outcome = string with the categorical outcome to be studied
+    predictors = list of strings with the categorical predictors
+    df = Pandas Data Frame with the data
+    -------
+    Returns:
+    Pandas df with the following columns for each predictor
+    p-value : p-value of chi-squared test
+    Relative_Risk: Relative Risk (RR) of first row vs second row
+    RR_lb: Lower bound of the 95% C.I for the RR
+    RR_ub: Upper bound of the 95% C.I for the RR
+    '''
+    #categorical_predictors = df.columns[2:7]
+    num_pred = len(categorical_predictors)
+    df2 = _pd.DataFrame(np.random.randn(num_pred, 4))
+    df2 = df2.set_index([categorical_predictors])
+    df2.columns = ['p-value', 'Relative_Risk','RR_lb','RR_ub']
+
+    for idx, var in enumerate(categorical_predictors):
+        T = contingency_table(var, outcome, df)
+        _, p , _, _= _chi2( T.values )
+        RR, lb, ub = relative_risk(T.values)
+
+        df2.iloc[idx,0] = p;
+        df2.iloc[idx,1] = RR;
+        df2.iloc[idx,2] = lb;
+        df2.iloc[idx,3] = ub;
+    return df2
+
+# Continous PREDICTORS and CATEGORICAL OUTCOMES
+
+
+def linear_models(df, outcome, predictors, print_results = 1):
+    # create new dataframe with predictors
+    df2 = _pd.DataFrame()
+    df2[predictors] = df[predictors]
+
+    # ad outcome to dataframe with predictors encoded as floating
+    df2[outcome]   = preprocessing.LabelEncoder().fit_transform( df[outcome].values )
+
+    # create formula from strings
+    formula = outcome + "~" + "+".join(predictors )
+
+    # perform ANOVA
+    SPY_lm = ols(formula, data = df2 ).fit()
+    anova = sm.stats.anova_lm(SPY_lm, typ=2) # Type 2 ANOVA DataFrame
+    if print_results == 1:
+        print(15*'---')
+        print(anova)
+        print(15*'---')
+    return anova, SPY_lm
+
+def anova_MRI(outcome, df):
+    mri_predictors= ['MRI_LD_Baseline','MRI_LD_1_3dAC', 'MRI_LD_Int_Reg', 'MRI_LD_PreSurg']
+    results = results = _pd.DataFrame(np.random.random(size=(len(mri_predictors),1)),
+                                          index=mri_predictors, columns=['p-value'])
+    for idx, pred in enumerate(mri_predictors):
+        p = list();     p.append(pred)
+        nova_table, _ = linear_models(df, outcome, p, print_results=0);
+        results.ix[idx] = nova_table['PR(>F)'].values[0]
+
+    f, (ax1, ax2) = plt.subplots(2,2, figsize=(10,10))
+    sns.boxplot(x= outcome, y=mri_predictors[0], data=df, palette="Set3", ax=ax1[0]).set_title('p-value = '+ str(results.values[0]));
+    sns.boxplot(x= outcome, y=mri_predictors[1], data=df, palette="Set3", ax=ax2[0]).set_title('p-value = '+ str(results.values[1]));
+    sns.boxplot(x= outcome, y=mri_predictors[2], data=df, palette="Set3", ax=ax1[1]).set_title('p-value = '+ str(results.values[2]));
+    sns.boxplot(x= outcome, y=mri_predictors[3], data=df, palette="Set3", ax=ax2[1]).set_title('p-value = '+ str(results.values[3]));
+    plt.show()
+    return results
+
+def effect_size(df,predictors, outcome):
+    all_ =  predictors + [outcome]
+    legend = 'Predictor of ' + outcome
+    mean_ = df[all_].groupby( outcome ).mean().values
+    std_ =  np.std( df[predictors].values.flatten() )
+    delta = mean_[0,:] - mean_[1,:]
+    effect_size = _pd.DataFrame(delta/std_)
+
+    effect_size[legend] = predictors
+    effect_size =    effect_size.set_index(legend)
+    effect_size.columns = ['Effect Size']
+    return effect_size