import numpy as np
import pandas as _pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency as _chi2
from scipy.stats import fisher_exact
from sklearn import preprocessing
from statsmodels.formula.api import ols
import statsmodels.api as sm
# CATEGORICAL PREDICTORS and CATEGORICAL OUTCOMES
def contingency_table(predictor, outcome, dataframe):
    '''
    contingency_table(predictor, outcome, dataframe)
    Build a 2x2 contingency table (rows = predictor, columns = outcome),
    with both axes ordered as ['Yes', 'No'].
    '''
    T = _pd.crosstab(dataframe[predictor], dataframe[outcome]).astype(float)
    T = T.loc[['Yes', 'No']]    # order rows: 'Yes' first
    T = T[['Yes', 'No']]        # order columns: 'Yes' first
    return T
def relative_risk(T):
    '''
    RR, lb, ub = relative_risk(T)
    Estimate the relative risk (RR), its lower 95% CI (lb), and its upper 95% CI (ub)
    from a 2x2 contingency table T (rows = exposure, columns = outcome).
    Source: https://www.medcalc.org/calc/relative_risk.php
    '''
    a = T[0, 0]   # first row, event
    b = T[0, 1]   # first row, no event
    c = T[1, 0]   # second row, event
    d = T[1, 1]   # second row, no event
    p_of_first_row = a / (a + b)
    p_of_second_row = c / (c + d)
    RR = p_of_first_row / p_of_second_row
    # standard error of log(RR)
    SE = np.sqrt(1/a + 1/c - 1/(a + b) - 1/(c + d))
    CI_95pct_lb = np.exp(np.log(RR) - 1.96 * SE)
    CI_95pct_ub = np.exp(np.log(RR) + 1.96 * SE)
    return np.round(RR, 4), np.round(CI_95pct_lb, 4), np.round(CI_95pct_ub, 4)
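# Minimal usage sketch for the two helpers above, on a small synthetic DataFrame
# built here (the column names 'Exposed' and 'Disease' and the counts are
# illustrative assumptions, not columns of the project's dataset).
def _demo_relative_risk():
    toy = _pd.DataFrame({'Exposed': ['Yes'] * 20 + ['No'] * 20,
                         'Disease': ['Yes'] * 12 + ['No'] * 8 + ['Yes'] * 5 + ['No'] * 15})
    T = contingency_table('Exposed', 'Disease', toy)   # 2x2 table, rows/cols ordered Yes/No
    RR, lb, ub = relative_risk(T.values)               # RR of row 'Yes' vs row 'No'
    print('RR = %s, 95%% CI = [%s, %s]' % (RR, lb, ub))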
def categorical_data(outcome, categorical_predictors, df):
    '''
    --------
    Syntax
    categorical_data(outcome, categorical_predictors, df)
    --------
    Inputs:
    outcome = string with the categorical outcome to be studied
    categorical_predictors = list of strings with the categorical predictors
    df = pandas DataFrame with the data
    -------
    Returns:
    pandas DataFrame with the following columns for each predictor
    p-value : p-value of the chi-squared test
    Relative_Risk: Relative Risk (RR) of the first row vs the second row
    RR_lb: lower bound of the 95% C.I. for the RR
    RR_ub: upper bound of the 95% C.I. for the RR
    '''
    num_pred = len(categorical_predictors)
    df2 = _pd.DataFrame(np.zeros((num_pred, 4)),
                        index=categorical_predictors,
                        columns=['p-value', 'Relative_Risk', 'RR_lb', 'RR_ub'])
    for idx, var in enumerate(categorical_predictors):
        T = contingency_table(var, outcome, df)
        _, p, _, _ = _chi2(T.values)
        RR, lb, ub = relative_risk(T.values)
        df2.iloc[idx, 0] = p
        df2.iloc[idx, 1] = RR
        df2.iloc[idx, 2] = lb
        df2.iloc[idx, 3] = ub
    return df2
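# Usage sketch for categorical_data(), again on a small synthetic DataFrame
# (the 'Response'/'Marker_*' column names are assumptions made up for the
# example; they only illustrate the expected 'Yes'/'No' coding).
def _demo_categorical_data():
    rng = np.random.RandomState(0)
    toy = _pd.DataFrame({'Response': rng.choice(['Yes', 'No'], size=200),
                         'Marker_A': rng.choice(['Yes', 'No'], size=200),
                         'Marker_B': rng.choice(['Yes', 'No'], size=200)})
    print(categorical_data('Response', ['Marker_A', 'Marker_B'], toy))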
# CONTINUOUS PREDICTORS and CATEGORICAL OUTCOMES
def linear_models(df, outcome, predictors, print_results=1):
    '''
    anova, lm_fit = linear_models(df, outcome, predictors, print_results=1)
    Fit an OLS model of the (label-encoded) categorical outcome on the
    continuous predictors and return the type-2 ANOVA table and the fitted model.
    '''
    # copy the predictors into a new dataframe
    df2 = df[predictors].copy()
    # add the outcome, encoded as a number, to the dataframe with the predictors
    df2[outcome] = preprocessing.LabelEncoder().fit_transform(df[outcome].values)
    # build the model formula from the strings
    formula = outcome + " ~ " + " + ".join(predictors)
    # fit the linear model and perform a type-2 ANOVA
    lm_fit = ols(formula, data=df2).fit()
    anova = sm.stats.anova_lm(lm_fit, typ=2)
    if print_results == 1:
        print(15 * '---')
        print(anova)
        print(15 * '---')
    return anova, lm_fit
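# Sketch of a single call to linear_models() on synthetic data: a binary
# outcome and two continuous predictors (the names 'size_cm', 'age' and
# 'responder' are assumptions invented for this example, not study variables).
def _demo_linear_models():
    rng = np.random.RandomState(1)
    n = 100
    toy = _pd.DataFrame({'size_cm': rng.normal(3.0, 1.0, n),
                         'age': rng.normal(55.0, 10.0, n)})
    toy['responder'] = np.where(toy['size_cm'] + rng.normal(0, 1, n) > 3.0, 'Yes', 'No')
    anova, fit = linear_models(toy, 'responder', ['size_cm', 'age'], print_results=1)
    return anova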
def anova_MRI(outcome, df):
    '''
    results = anova_MRI(outcome, df)
    One-predictor ANOVA (via linear_models) of each MRI largest-dimension
    measurement against the outcome, plus a boxplot of each predictor by outcome.
    Returns a DataFrame of p-values indexed by MRI predictor.
    '''
    mri_predictors = ['MRI_LD_Baseline', 'MRI_LD_1_3dAC', 'MRI_LD_Int_Reg', 'MRI_LD_PreSurg']
    results = _pd.DataFrame(np.zeros((len(mri_predictors), 1)),
                            index=mri_predictors, columns=['p-value'])
    for idx, pred in enumerate(mri_predictors):
        anova_table, _ = linear_models(df, outcome, [pred], print_results=0)
        results.iloc[idx, 0] = anova_table['PR(>F)'].values[0]
    f, axes = plt.subplots(2, 2, figsize=(10, 10))
    for ax, pred, p in zip(axes.flatten(), mri_predictors, results['p-value'].values):
        sns.boxplot(x=outcome, y=pred, data=df, palette="Set3", ax=ax).set_title('p-value = %.4f' % p)
    plt.show()
    return results
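# anova_MRI() expects the four MRI largest-dimension columns listed above; a
# hedged sketch with synthetic values under those column names and an assumed
# 'PCR' outcome column (made-up data, for illustration only; opens a figure).
def _demo_anova_MRI():
    rng = np.random.RandomState(2)
    n = 80
    toy = _pd.DataFrame({c: rng.gamma(2.0, 2.0, n) for c in
                         ['MRI_LD_Baseline', 'MRI_LD_1_3dAC', 'MRI_LD_Int_Reg', 'MRI_LD_PreSurg']})
    toy['PCR'] = rng.choice(['Yes', 'No'], size=n)
    return anova_MRI('PCR', toy)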
def effect_size(df, predictors, outcome):
    '''
    effect_size(df, predictors, outcome)
    Difference between the two outcome-group means of each predictor,
    scaled by the pooled standard deviation of all listed predictor values.
    '''
    all_ = predictors + [outcome]
    legend = 'Predictor of ' + outcome
    mean_ = df[all_].groupby(outcome).mean().values
    # pooled standard deviation over all predictor values
    std_ = np.std(df[predictors].values.flatten())
    delta = mean_[0, :] - mean_[1, :]   # first outcome group minus second
    effect_size = _pd.DataFrame(delta / std_)
    effect_size[legend] = predictors
    effect_size = effect_size.set_index(legend)
    effect_size.columns = ['Effect Size']
    return effect_size
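# Small sketch of effect_size() on synthetic data (names and values are
# assumptions for illustration): the result is each predictor's between-group
# mean difference divided by one pooled standard deviation.
def _demo_effect_size():
    rng = np.random.RandomState(3)
    n = 60
    toy = _pd.DataFrame({'size_cm': rng.normal(3.0, 1.0, n),
                         'age': rng.normal(55.0, 10.0, n),
                         'responder': rng.choice(['Yes', 'No'], size=n)})
    return effect_size(toy, ['size_cm', 'age'], 'responder')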