growth-curves / Git / [d8a979] /code/adiposity_rebound/ar

Models:
RaymondKing/
growth-curves
Downloads: 1
[d8a979]: / code / adiposity_rebound / ar_calc.py
History
Download this file
426 lines (335 with data), 17.5 kB

##### SETUP ######

#import sys
#sys.path.append('../visualize')
#import vis_quartiles_individual

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
from scipy import interpolate
import math

##### VARIABLES ######

attributes = ["ht","wt","bmi"]
percentiles = np.array([3, 5, 10, 25, 50, 75, 85, 90, 95, 97])
intervals = np.trunc(np.concatenate((np.array([0.01]),np.arange(.05,.2,.05),np.arange(.4,2,.2), np.arange(2,20,.5)))*100)/100

## Plot figures?
plot_bool = False

## Plot for blog style if plot_bool is true?
plot_for_blog = False

## Input individual patients? (Otherwise pulls all patients from BMI_filtered_contain_age5.pkl)
input_patient_bool = False

## Individual patient to analyze if input_patient_bool is true
patient_id = 16785 #16785, 10258, 9026, 12322, 11995, 12552

## Limit number of patients plot
count_bool = False

## Plot log on y axis?
plot_log = False
font_size = 14


######################

##### FUNCTIONS ######

def find_percentile(id_patient, age, grouped_age, grouped_patient_age):                                     
    df_datapoint = grouped_patient_age.get_group((id_patient, age))
    ht = df_datapoint["ht"]
    wt = df_datapoint["wt"]
    bmi = df_datapoint["bmi"]
    gender = df_datapoint["gender"]
                                     
    group_age = grouped_age.get_group(age)
    group_age = group_age[group_age["gender"] == gender.iloc[0]]
    
    num_patients = group_age.shape[0]
    percent_ht = np.sum(group_age["ht"] < ht.iloc[0])*100.0/num_patients
    percent_wt = np.sum(group_age["wt"] < wt.iloc[0])*100.0/num_patients
    percent_bmi = np.sum(group_age["bmi"] < bmi.iloc[0])*100.0/num_patients
    return (percent_ht, percent_wt, percent_bmi, ht.iloc[0], wt.iloc[0], bmi.iloc[0])

######################

## Open pickle file, saved from bmi_adiposity_rebound_filter.py
## df_range contains individuals that have data in the age range where AR occurs
df = pickle.load(open('../../data/pkl/BMI_filtered_contain_age5.pkl', 'rb'))

## Open pickle file, saved from bmi_initial_processing.py
df_resampled = pickle.load(open('../../data/pkl/BMI_resampled.pkl', 'rb'))

## Already ran calculation ##
## Keep only method 4 results ##
#pat_list = pd.read_csv("../../data/csv/BMI_ar_4_method4.csv", names = ["Patient ID"])
#df = df[df["Patient ID"].isin(pat_list["Patient ID"])]

## Group datapoints for each patient
grouped = df.groupby(["id"])

resampled_grouped_age = df_resampled.groupby(["age"])
resampled_grouped_patient_age = df_resampled.groupby(["id", "age"])

## Create new list to add AR information for each patient
df_ar_list = []
                           
## Iterate through each patient
count = 0
for name_patient, group_patient in grouped:

    #print name_patient
    if count_bool:
        count = count + 1
        if count < 0:
            continue
        if count == 20:
            break
    if input_patient_bool:
        name_patient = patient_id    
        group_patient = grouped.get_group(name_patient)
    #print name_patient
    #print group_patient

    # Throw out data less than 2 years
    group_patient = group_patient[group_patient["age"] >= 2.0]
    
    if plot_bool and not plot_for_blog:
        ## Create figure to hold patient-specific plots 
        fig = plt.figure()
        ax_age_ht = fig.add_subplot(221)
        ax_age_wt = fig.add_subplot(222)
        ax_age_bmi = fig.add_subplot(223)
        ax_velo = fig.add_subplot(224) ## First derivative of log(ht), log(wt), and log(bmi)
    elif plot_for_blog:
        fig = plt.figure(1, figsize=(15, 4.75))
        fig.subplots_adjust(bottom=0.15, left=0.05, right=0.975)
        ax_age_ht = fig.add_subplot(131)
        ax_age_wt = fig.add_subplot(132) 
        ax_age_bmi = fig.add_subplot(133)
        fig2 = plt.figure()
        ax_velo = fig2.add_subplot(111)
        
    ## Count number of datapoints for this patient
    num_x = group_patient.shape[0]
    
    # And at least 5 datapoints?
    if num_x < 5:
        continue
    
    ## Regression of log(ht) on age
    x = group_patient["age"]
    log_ht = np.log(group_patient["ht"])
    y = log_ht

    ## Model with cubic equation
    X = np.column_stack((x*x*x, x*x, x, np.ones(num_x)))

    ## Make OLS model
    model = sm.OLS(y,X)
    results = model.fit()

    ## Sample fit
    b = results.params
    xnew = np.linspace(x.min(),x.max(),100)
    ynew_ht_log = b[0]*xnew*xnew*xnew + b[1]*xnew*xnew + b[2]*xnew + b[3]
    ynew_ht = np.power(math.e, ynew_ht_log)

    ## Calculate first derivative of log(ht)
    ynew_ht_log_velo = 3*b[0]*xnew*xnew + 2*b[1]*xnew + b[2]

    if plot_bool:
        ax_velo.plot(xnew,2* ynew_ht_log_velo, 'r-', label="2*d(log(Height))/dt")
        if plot_log:
            ## Plot original data points and fit on log-linear plot
            ax_age_ht.plot(x,y, 'ro')
            ax_age_ht.plot(xnew,ynew_ht_log, 'r-')
        else:
            ## Plot original data points and fit on lin-lin plot
            ax_age_ht.plot(x,group_patient["ht"], 'ro')
            ax_age_ht.plot(xnew,ynew_ht, 'r-')

    ## Regression of log(wt) on age
    x = group_patient["age"]
    log_wt = np.log(group_patient["wt"])
    y = log_wt

    ## Model with cubic equation
    X = np.column_stack((x*x*x, x*x, x, np.ones(num_x)))

    ## Make RLM model
    #model = sm.OLS(y,X)
    model = sm.RLM(y,X)

    ## Sample fit
    results = model.fit()
    b = results.params
    xnew = np.linspace(x.min(),x.max(),100)
    ynew_wt_log = b[0]*xnew*xnew*xnew + b[1]*xnew*xnew + b[2]*xnew + b[3]
    ynew_wt = np.power(math.e, ynew_wt_log)

    ## Calculate first derivative of log(ht)
    ynew_wt_log_velo = 3*b[0]*xnew*xnew + 2*b[1]*xnew + b[2]

    if plot_bool:
        ax_velo.plot(xnew,ynew_wt_log_velo, 'b-', label="d(log(Weight))/dt")
        if plot_log:
            ## Plot original data points and fit on lin-lin plot
            ax_age_wt.plot(x,y, 'bo')
            ax_age_wt.plot(xnew,ynew_wt_log, 'b-')
        else:
            ## Plot original data points and fit on log-linear plot
            ax_age_wt.plot(x,group_patient["wt"], 'bo')
            ax_age_wt.plot(xnew,ynew_wt, 'b-')

    ## Calculate first derivative of log(bmi) from first derivatives of log(wt) and log(ht)
    bmi_velo = np.subtract(ynew_wt_log_velo, 2*ynew_ht_log_velo)

    if plot_bool:
        ## Plot first derivative of log(bmi) on log-linear plot
        ax_velo.plot(xnew, bmi_velo, 'g-', label = "d(log(BMI))/dt")
        ax_velo.axhline(y=0)

        ## Plot original BMI datapoints
        ax_age_bmi.plot(group_patient["age"],group_patient["bmi"], 'go')

        ynew_bmi = np.divide(ynew_wt, np.power(ynew_ht, 2)) * 703
        ax_age_bmi.plot(xnew, ynew_bmi, 'g-')
   
        ## Set axis labels
        ax_age_ht.set_xlabel("Age (years)")
        ax_age_wt.set_xlabel("Age (years)")
        ax_age_bmi.set_xlabel("Age (years)")
        if plot_log:
            ax_age_ht.set_ylabel("log(Height (inches))")
            ax_age_wt.set_ylabel("log(Weight (pounds))")
        else:
            ax_age_ht.set_ylabel("Height (inches)")
            ax_age_wt.set_ylabel("Weight (pounds)")              
        ax_age_bmi.set_ylabel("BMI")

        ax_velo.set_xlabel("Age (years)")
        ax_velo.set_ylabel("d/dt(log(variable))")
        ax_velo.legend(loc = "lower right", prop={'size':font_size})

        ## Set title
        if not plot_for_blog:
            fig.suptitle("Patient #" + str(int(name_patient)) + " Progressing through Adiposity Rebound", fontsize = font_size)
        else:
            fig.suptitle("Individual Patient with Adiposity Rebound", fontsize = font_size)

        font = {'family' : 'normal', 'weight' : 'normal', 'size'   : font_size}
        plt.rc('font', **font)

    ## Create DataFrame with first derivatives
    df_derivs = pd.DataFrame({"age":xnew, "ht_velo": ynew_ht_log_velo, "wt_velo": ynew_wt_log_velo, "bmi_velo": bmi_velo})
    
    ## Adiposity rebound occurs when velocity of log(BMI) goes from negative to
    ## to positive. If the velocity stays positive, the ratio curve opens up,
    ## and thus the minimum of the curve is the adiposity rebound

    ## Default value of ar_age, indicating it could not be found
    ar_age = np.nan
    
    ## CASE 1: BMI derivative stays above 0. Pick the minimum.
    if df_derivs["bmi_velo"].min() > 0:
        ar_age = df_derivs["age"].ix[df_derivs["bmi_velo"].idxmin()]
        if plot_bool:
            ax_velo.axvline(x=ar_age)
        #print "here1"
        ar_find = 1

    ## CASE 2: BMI derivative stays below 0. Pick the maximum.
    elif df_derivs["bmi_velo"].max() < 0:
        ar_age = df_derivs["age"].ix[df_derivs["bmi_velo"].idxmax()]
        if plot_bool:
            ax_velo.axvline(x=ar_age)
        #print "here2"
        ar_find = 2
    
    ## Note that the BMI derivative might cross 0 twice
    else:
        ## Find the index of the datapoint with the largest BMI derivative
        max_bmi_velo_index = df_derivs["bmi_velo"].idxmax()
        min_bmi_velo_index = df_derivs["bmi_velo"].idxmin()

        ## Find the largest index
        largest_index = max(df_derivs["bmi_velo"].index)

        ## CASE 3: BMI curve starts > 0, ends < 0. Unusual because that means
        ## BMI goes through maximum. Maybe pick earliest age as AR?
        if df_derivs["bmi_velo"].iloc[0] > 0 and df_derivs["bmi_velo"].iloc[-1] < 0:
            ar_find = 3
            df_derivs_cut = df_derivs

        ## CASE 4: BMI curve starts < 0, ends > 0. 
        elif df_derivs["bmi_velo"].iloc[0] < 0 and df_derivs["bmi_velo"].iloc[-1] > 0:
            ar_find = 4
            df_derivs_cut = df_derivs.ix[min_bmi_velo_index:largest_index]
            
        ## CASE 5: BMI curve starts and ends > 0, and opens up, crossing 0 twice
        ## So extract portion from [minimum, right end] when BMI derivative crosses
        ## from negative to positive
        elif df_derivs["bmi_velo"].iloc[0] > 0 and df_derivs["bmi_velo"].iloc[-1] > 0:
            ar_find = 5
            df_derivs_cut = df_derivs.ix[min_bmi_velo_index:largest_index]

        ## CASE 6: BMI curve starts and ends < 0, and opens down, crossing 0 twice
        ## So extract portion from [left end, maximum] when BMI derivative crosses
        ## from negative to positive
        else:
            ar_find = 6
            df_derivs_cut = df_derivs.ix[0:max_bmi_velo_index]
            
        ## Linearly interpolate age on first derivative of log(bmi)
        #f = interpolate.InterpolatedUnivariateSpline(df_derivs_cut["bmi_velo"], df_derivs_cut["age"])
        f = interpolate.interp1d(df_derivs_cut["bmi_velo"], df_derivs_cut["age"])

        ## Try to find age at which first derivative of log(bmi) == 0
        try:
            ar_age = f(0)
            if plot_bool:
                ax_velo.axvline(x=ar_age)
                if plot_for_blog:
                    ax_age_bmi.axvline(x=ar_age, color = "red", ls = "--")

        ## If cannot interpolate, means that either no ages or two ages were found. Thus, cannot determine AR.
        except ValueError as inst:
            print "ERROR: " + str(name_patient) + " - " + inst.args[0]
        
    if plot_bool:
        ## Plot when patient's indvidual curve over population growth curves
        #vis_quartiles_individual.plot_individual_against_percentiles(name_patient)

        
        ## Display plot
        plt.show() #bbox_inches='tight', transparent="True", pad_inches=0

    ## Now calculate percentile patient is end at the end of his/her growth curve
            
    ## Use resampled dataset, and group by patients
    grouped_resampled_patient = df_resampled.groupby(["id"])
    grouped_resampled_patient_age = df_resampled.groupby(["id", "age"])

    ## Select the last age datapoint, and record age, height, weight, and BMI
    last_resampled_dpt = grouped_resampled_patient.get_group(name_patient).sort("age").iloc[-1]
    last_age = last_resampled_dpt["age"]
    last_ht = last_resampled_dpt["ht"]
    last_wt = last_resampled_dpt["wt"]
    last_bmi = last_resampled_dpt["bmi"]

    first_resampled_dpt = grouped_resampled_patient.get_group(name_patient).sort("age").iloc[0]
    first_age = first_resampled_dpt["age"]

    ## Calculate weight, height, BMI percentiles at AR, if AR was found

    if np.isnan(ar_age):
        ar_ht_perc = np.nan
        ar_wt_perc = np.nan
        ar_bmi_perc = np.nan

        ar_ht_res = np.nan
        ar_wt_res = np.nan
        ar_bmi_res = np.nan
        
    else:
        interval_index_right = np.searchsorted(intervals, ar_age)
        
        if ar_age in intervals:
            (ar_ht_perc, ar_wt_perc, ar_bmi_perc, ar_ht_res, ar_wt_res, ar_bmi_res) = find_percentile(name_patient, ar_age, resampled_grouped_age, resampled_grouped_patient_age)
        else:
            interval_index_left = interval_index_right - 1
            age_left = intervals[interval_index_left]
            age_right = intervals[interval_index_right]

            if age_left < group_patient["age"].min():
                (ar_ht_perc, ar_wt_perc, ar_bmi_perc, ar_ht_res, ar_wt_res, ar_bmi_res) = find_percentile(name_patient, age_right, resampled_grouped_age, resampled_grouped_patient_age)
                
            elif age_right > group_patient["age"].max():
                (ar_ht_perc, ar_wt_perc, ar_bmi_perc, ar_ht_res, ar_wt_res, ar_bmi_res) = find_percentile(name_patient, age_left, resampled_grouped_age, resampled_grouped_patient_age)
                
            else:
                (ht_left_perc, wt_left_perc, bmi_left_perc, ar_ht_left_res, ar_wt_left_res, ar_bmi_left_res) = \
                               find_percentile(name_patient, age_left, resampled_grouped_age, resampled_grouped_patient_age)
                (ht_right_perc, wt_right_perc, bmi_right_perc, ar_ht_right_res, ar_wt_right_res, ar_bmi_right_res) = \
                                find_percentile(name_patient, age_right, resampled_grouped_age, resampled_grouped_patient_age)
                    
                right_ratio = (ar_age - age_left)/(age_right - age_left)
                left_ratio = 1 - right_ratio
                ar_ht_perc = ht_left_perc * left_ratio + ht_right_perc * right_ratio
                ar_wt_perc = wt_left_perc * left_ratio + wt_right_perc * right_ratio
                ar_bmi_perc = bmi_left_perc * left_ratio + bmi_right_perc * right_ratio
                
                ar_ht_res = ar_ht_left_res * left_ratio + ar_ht_right_res * right_ratio
                ar_wt_res = ar_wt_left_res * left_ratio + ar_wt_right_res * right_ratio
                ar_bmi_res = ar_bmi_left_res * left_ratio + ar_bmi_right_res * right_ratio

    ## Create new numpy list that is of the correct dimensions
    #row = np.array([None]*len(header_list))
    #row = np.reshape(row, (1, len(header_list)))

    ## Create new DataFrame row
    #df_row = pd.DataFrame(row, columns = header_list)
    df_row = dict()

    ## Save patient information to this new row
    df_row["id"] = name_patient
    df_row["gender"] = group_patient["gender"].iloc[0]
    df_row["race_ethnicity"] = group_patient["race_ethnicity"].iloc[0]
    df_row["count"] = num_x
    df_row["ar_find"] = ar_find
    df_row["age_ar"] = ar_age
    df_row["age_front"] = first_age
    df_row["age_end"] = last_age

    ## Find and save height, weight, BMI percentiles at the last resampled datapoint
    (df_row["perc_ht_end"], df_row["perc_wt_end"], df_row["perc_bmi_end"], \
     df_row["res_ht_end"], df_row["res_wt_end"], df_row["res_bmi_end"]) = \
                            find_percentile(name_patient, last_age, resampled_grouped_age, resampled_grouped_patient_age)

    ## Find and save height, weight, BMI percentiles at the first resampled datapoint
    (df_row["perc_ht_front"], df_row["perc_wt_front"], df_row["perc_bmi_front"], \
     df_row["res_ht_front"], df_row["res_wt_front"], df_row["res_bmi_front"]) = \
                            find_percentile(name_patient, first_age, resampled_grouped_age, resampled_grouped_patient_age)

    ## Seight, weight, BMI percentiles at AR
    (df_row["perc_ht_ar"], df_row["perc_wt_ar"], df_row["perc_bmi_ar"]) = \
                            (ar_ht_perc, ar_wt_perc, ar_bmi_perc)
    (df_row["res_ht_ar"], df_row["res_wt_ar"], df_row["res_bmi_ar"]) = \
                            (ar_ht_res, ar_wt_res, ar_bmi_res)

    ## Append new row to aggregate AR info dataframe
    df_ar_list.append(df_row)
    #df_ar_info = pd.DataFrame.append(df_ar_info, df_row)

    if input_patient_bool:
        break
df_ar_info = pd.DataFrame(df_ar_list)
    
if not input_patient_bool:
    ## Dump data
    #output = open('../../data/pkl/BMI_ar_4_onlymethod4_calculate_5pt.pkl', 'wb')
    output = open('../../data/pkl/BMI_filtered_contain_age5_ar_calculate.pkl', 'wb')
    pickle.dump(df_ar_info, output, -1)
    output.close()

    #df_ar_info.to_csv("../../data/csv/BMI_ar_4_onlymethod4_calculate_5pt.csv")
    df_ar_info.to_csv("../../data/csv/BMI_filtered_contain_age5_ar_calculate.csv")