Switch to side-by-side view

--- a
+++ b/code/adiposity_rebound/ar_calc.py
@@ -0,0 +1,425 @@
+##### SETUP ######
+
+#import sys
+#sys.path.append('../visualize')
+#import vis_quartiles_individual
+
+import pickle
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+import statsmodels.api as sm
+from scipy import interpolate
+import math
+
+##### VARIABLES ######
+
+attributes = ["ht","wt","bmi"]
+percentiles = np.array([3, 5, 10, 25, 50, 75, 85, 90, 95, 97])
+intervals = np.trunc(np.concatenate((np.array([0.01]),np.arange(.05,.2,.05),np.arange(.4,2,.2), np.arange(2,20,.5)))*100)/100
+
+## Plot figures?
+plot_bool = False
+
+## Plot for blog style if plot_bool is true?
+plot_for_blog = False
+
+## Input individual patients? (Otherwise pulls all patients from BMI_filtered_contain_age5.pkl)
+input_patient_bool = False
+
+## Individual patient to analyze if input_patient_bool is true
+patient_id = 16785 #16785, 10258, 9026, 12322, 11995, 12552
+
+## Limit number of patients plot
+count_bool = False
+
+## Plot log on y axis?
+plot_log = False
+font_size = 14
+
+
+######################
+
+##### FUNCTIONS ######
+
+def find_percentile(id_patient, age, grouped_age, grouped_patient_age):                                     
+    df_datapoint = grouped_patient_age.get_group((id_patient, age))
+    ht = df_datapoint["ht"]
+    wt = df_datapoint["wt"]
+    bmi = df_datapoint["bmi"]
+    gender = df_datapoint["gender"]
+                                     
+    group_age = grouped_age.get_group(age)
+    group_age = group_age[group_age["gender"] == gender.iloc[0]]
+    
+    num_patients = group_age.shape[0]
+    percent_ht = np.sum(group_age["ht"] < ht.iloc[0])*100.0/num_patients
+    percent_wt = np.sum(group_age["wt"] < wt.iloc[0])*100.0/num_patients
+    percent_bmi = np.sum(group_age["bmi"] < bmi.iloc[0])*100.0/num_patients
+    return (percent_ht, percent_wt, percent_bmi, ht.iloc[0], wt.iloc[0], bmi.iloc[0])
+
+######################
+
+## Open pickle file, saved from bmi_adiposity_rebound_filter.py
+## df_range contains individuals that have data in the age range where AR occurs
+df = pickle.load(open('../../data/pkl/BMI_filtered_contain_age5.pkl', 'rb'))
+
+## Open pickle file, saved from bmi_initial_processing.py
+df_resampled = pickle.load(open('../../data/pkl/BMI_resampled.pkl', 'rb'))
+
+## Already ran calculation ##
+## Keep only method 4 results ##
+#pat_list = pd.read_csv("../../data/csv/BMI_ar_4_method4.csv", names = ["Patient ID"])
+#df = df[df["Patient ID"].isin(pat_list["Patient ID"])]
+
+## Group datapoints for each patient
+grouped = df.groupby(["id"])
+
+resampled_grouped_age = df_resampled.groupby(["age"])
+resampled_grouped_patient_age = df_resampled.groupby(["id", "age"])
+
+## Create new list to add AR information for each patient
+df_ar_list = []
+                           
+## Iterate through each patient
+count = 0
+for name_patient, group_patient in grouped:
+
+    #print name_patient
+    if count_bool:
+        count = count + 1
+        if count < 0:
+            continue
+        if count == 20:
+            break
+    if input_patient_bool:
+        name_patient = patient_id    
+        group_patient = grouped.get_group(name_patient)
+    #print name_patient
+    #print group_patient
+
+    # Throw out data less than 2 years
+    group_patient = group_patient[group_patient["age"] >= 2.0]
+    
+    if plot_bool and not plot_for_blog:
+        ## Create figure to hold patient-specific plots 
+        fig = plt.figure()
+        ax_age_ht = fig.add_subplot(221)
+        ax_age_wt = fig.add_subplot(222)
+        ax_age_bmi = fig.add_subplot(223)
+        ax_velo = fig.add_subplot(224) ## First derivative of log(ht), log(wt), and log(bmi)
+    elif plot_for_blog:
+        fig = plt.figure(1, figsize=(15, 4.75))
+        fig.subplots_adjust(bottom=0.15, left=0.05, right=0.975)
+        ax_age_ht = fig.add_subplot(131)
+        ax_age_wt = fig.add_subplot(132) 
+        ax_age_bmi = fig.add_subplot(133)
+        fig2 = plt.figure()
+        ax_velo = fig2.add_subplot(111)
+        
+    ## Count number of datapoints for this patient
+    num_x = group_patient.shape[0]
+    
+    # And at least 5 datapoints?
+    if num_x < 5:
+        continue
+    
+    ## Regression of log(ht) on age
+    x = group_patient["age"]
+    log_ht = np.log(group_patient["ht"])
+    y = log_ht
+
+    ## Model with cubic equation
+    X = np.column_stack((x*x*x, x*x, x, np.ones(num_x)))
+
+    ## Make OLS model
+    model = sm.OLS(y,X)
+    results = model.fit()
+
+    ## Sample fit
+    b = results.params
+    xnew = np.linspace(x.min(),x.max(),100)
+    ynew_ht_log = b[0]*xnew*xnew*xnew + b[1]*xnew*xnew + b[2]*xnew + b[3]
+    ynew_ht = np.power(math.e, ynew_ht_log)
+
+    ## Calculate first derivative of log(ht)
+    ynew_ht_log_velo = 3*b[0]*xnew*xnew + 2*b[1]*xnew + b[2]
+
+    if plot_bool:
+        ax_velo.plot(xnew,2* ynew_ht_log_velo, 'r-', label="2*d(log(Height))/dt")
+        if plot_log:
+            ## Plot original data points and fit on log-linear plot
+            ax_age_ht.plot(x,y, 'ro')
+            ax_age_ht.plot(xnew,ynew_ht_log, 'r-')
+        else:
+            ## Plot original data points and fit on lin-lin plot
+            ax_age_ht.plot(x,group_patient["ht"], 'ro')
+            ax_age_ht.plot(xnew,ynew_ht, 'r-')
+
+    ## Regression of log(wt) on age
+    x = group_patient["age"]
+    log_wt = np.log(group_patient["wt"])
+    y = log_wt
+
+    ## Model with cubic equation
+    X = np.column_stack((x*x*x, x*x, x, np.ones(num_x)))
+
+    ## Make RLM model
+    #model = sm.OLS(y,X)
+    model = sm.RLM(y,X)
+
+    ## Sample fit
+    results = model.fit()
+    b = results.params
+    xnew = np.linspace(x.min(),x.max(),100)
+    ynew_wt_log = b[0]*xnew*xnew*xnew + b[1]*xnew*xnew + b[2]*xnew + b[3]
+    ynew_wt = np.power(math.e, ynew_wt_log)
+
+    ## Calculate first derivative of log(ht)
+    ynew_wt_log_velo = 3*b[0]*xnew*xnew + 2*b[1]*xnew + b[2]
+
+    if plot_bool:
+        ax_velo.plot(xnew,ynew_wt_log_velo, 'b-', label="d(log(Weight))/dt")
+        if plot_log:
+            ## Plot original data points and fit on lin-lin plot
+            ax_age_wt.plot(x,y, 'bo')
+            ax_age_wt.plot(xnew,ynew_wt_log, 'b-')
+        else:
+            ## Plot original data points and fit on log-linear plot
+            ax_age_wt.plot(x,group_patient["wt"], 'bo')
+            ax_age_wt.plot(xnew,ynew_wt, 'b-')
+
+    ## Calculate first derivative of log(bmi) from first derivatives of log(wt) and log(ht)
+    bmi_velo = np.subtract(ynew_wt_log_velo, 2*ynew_ht_log_velo)
+
+    if plot_bool:
+        ## Plot first derivative of log(bmi) on log-linear plot
+        ax_velo.plot(xnew, bmi_velo, 'g-', label = "d(log(BMI))/dt")
+        ax_velo.axhline(y=0)
+
+        ## Plot original BMI datapoints
+        ax_age_bmi.plot(group_patient["age"],group_patient["bmi"], 'go')
+
+        ynew_bmi = np.divide(ynew_wt, np.power(ynew_ht, 2)) * 703
+        ax_age_bmi.plot(xnew, ynew_bmi, 'g-')
+   
+        ## Set axis labels
+        ax_age_ht.set_xlabel("Age (years)")
+        ax_age_wt.set_xlabel("Age (years)")
+        ax_age_bmi.set_xlabel("Age (years)")
+        if plot_log:
+            ax_age_ht.set_ylabel("log(Height (inches))")
+            ax_age_wt.set_ylabel("log(Weight (pounds))")
+        else:
+            ax_age_ht.set_ylabel("Height (inches)")
+            ax_age_wt.set_ylabel("Weight (pounds)")              
+        ax_age_bmi.set_ylabel("BMI")
+
+        ax_velo.set_xlabel("Age (years)")
+        ax_velo.set_ylabel("d/dt(log(variable))")
+        ax_velo.legend(loc = "lower right", prop={'size':font_size})
+
+        ## Set title
+        if not plot_for_blog:
+            fig.suptitle("Patient #" + str(int(name_patient)) + " Progressing through Adiposity Rebound", fontsize = font_size)
+        else:
+            fig.suptitle("Individual Patient with Adiposity Rebound", fontsize = font_size)
+
+        font = {'family' : 'normal', 'weight' : 'normal', 'size'   : font_size}
+        plt.rc('font', **font)
+
+    ## Create DataFrame with first derivatives
+    df_derivs = pd.DataFrame({"age":xnew, "ht_velo": ynew_ht_log_velo, "wt_velo": ynew_wt_log_velo, "bmi_velo": bmi_velo})
+    
+    ## Adiposity rebound occurs when velocity of log(BMI) goes from negative to
+    ## to positive. If the velocity stays positive, the ratio curve opens up,
+    ## and thus the minimum of the curve is the adiposity rebound
+
+    ## Default value of ar_age, indicating it could not be found
+    ar_age = np.nan
+    
+    ## CASE 1: BMI derivative stays above 0. Pick the minimum.
+    if df_derivs["bmi_velo"].min() > 0:
+        ar_age = df_derivs["age"].ix[df_derivs["bmi_velo"].idxmin()]
+        if plot_bool:
+            ax_velo.axvline(x=ar_age)
+        #print "here1"
+        ar_find = 1
+
+    ## CASE 2: BMI derivative stays below 0. Pick the maximum.
+    elif df_derivs["bmi_velo"].max() < 0:
+        ar_age = df_derivs["age"].ix[df_derivs["bmi_velo"].idxmax()]
+        if plot_bool:
+            ax_velo.axvline(x=ar_age)
+        #print "here2"
+        ar_find = 2
+    
+    ## Note that the BMI derivative might cross 0 twice
+    else:
+        ## Find the index of the datapoint with the largest BMI derivative
+        max_bmi_velo_index = df_derivs["bmi_velo"].idxmax()
+        min_bmi_velo_index = df_derivs["bmi_velo"].idxmin()
+
+        ## Find the largest index
+        largest_index = max(df_derivs["bmi_velo"].index)
+
+        ## CASE 3: BMI curve starts > 0, ends < 0. Unusual because that means
+        ## BMI goes through maximum. Maybe pick earliest age as AR?
+        if df_derivs["bmi_velo"].iloc[0] > 0 and df_derivs["bmi_velo"].iloc[-1] < 0:
+            ar_find = 3
+            df_derivs_cut = df_derivs
+
+        ## CASE 4: BMI curve starts < 0, ends > 0. 
+        elif df_derivs["bmi_velo"].iloc[0] < 0 and df_derivs["bmi_velo"].iloc[-1] > 0:
+            ar_find = 4
+            df_derivs_cut = df_derivs.ix[min_bmi_velo_index:largest_index]
+            
+        ## CASE 5: BMI curve starts and ends > 0, and opens up, crossing 0 twice
+        ## So extract portion from [minimum, right end] when BMI derivative crosses
+        ## from negative to positive
+        elif df_derivs["bmi_velo"].iloc[0] > 0 and df_derivs["bmi_velo"].iloc[-1] > 0:
+            ar_find = 5
+            df_derivs_cut = df_derivs.ix[min_bmi_velo_index:largest_index]
+
+        ## CASE 6: BMI curve starts and ends < 0, and opens down, crossing 0 twice
+        ## So extract portion from [left end, maximum] when BMI derivative crosses
+        ## from negative to positive
+        else:
+            ar_find = 6
+            df_derivs_cut = df_derivs.ix[0:max_bmi_velo_index]
+            
+        ## Linearly interpolate age on first derivative of log(bmi)
+        #f = interpolate.InterpolatedUnivariateSpline(df_derivs_cut["bmi_velo"], df_derivs_cut["age"])
+        f = interpolate.interp1d(df_derivs_cut["bmi_velo"], df_derivs_cut["age"])
+
+        ## Try to find age at which first derivative of log(bmi) == 0
+        try:
+            ar_age = f(0)
+            if plot_bool:
+                ax_velo.axvline(x=ar_age)
+                if plot_for_blog:
+                    ax_age_bmi.axvline(x=ar_age, color = "red", ls = "--")
+
+        ## If cannot interpolate, means that either no ages or two ages were found. Thus, cannot determine AR.
+        except ValueError as inst:
+            print "ERROR: " + str(name_patient) + " - " + inst.args[0]
+        
+    if plot_bool:
+        ## Plot when patient's indvidual curve over population growth curves
+        #vis_quartiles_individual.plot_individual_against_percentiles(name_patient)
+
+        
+        ## Display plot
+        plt.show() #bbox_inches='tight', transparent="True", pad_inches=0
+
+    ## Now calculate percentile patient is end at the end of his/her growth curve
+            
+    ## Use resampled dataset, and group by patients
+    grouped_resampled_patient = df_resampled.groupby(["id"])
+    grouped_resampled_patient_age = df_resampled.groupby(["id", "age"])
+
+    ## Select the last age datapoint, and record age, height, weight, and BMI
+    last_resampled_dpt = grouped_resampled_patient.get_group(name_patient).sort("age").iloc[-1]
+    last_age = last_resampled_dpt["age"]
+    last_ht = last_resampled_dpt["ht"]
+    last_wt = last_resampled_dpt["wt"]
+    last_bmi = last_resampled_dpt["bmi"]
+
+    first_resampled_dpt = grouped_resampled_patient.get_group(name_patient).sort("age").iloc[0]
+    first_age = first_resampled_dpt["age"]
+
+    ## Calculate weight, height, BMI percentiles at AR, if AR was found
+
+    if np.isnan(ar_age):
+        ar_ht_perc = np.nan
+        ar_wt_perc = np.nan
+        ar_bmi_perc = np.nan
+
+        ar_ht_res = np.nan
+        ar_wt_res = np.nan
+        ar_bmi_res = np.nan
+        
+    else:
+        interval_index_right = np.searchsorted(intervals, ar_age)
+        
+        if ar_age in intervals:
+            (ar_ht_perc, ar_wt_perc, ar_bmi_perc, ar_ht_res, ar_wt_res, ar_bmi_res) = find_percentile(name_patient, ar_age, resampled_grouped_age, resampled_grouped_patient_age)
+        else:
+            interval_index_left = interval_index_right - 1
+            age_left = intervals[interval_index_left]
+            age_right = intervals[interval_index_right]
+
+            if age_left < group_patient["age"].min():
+                (ar_ht_perc, ar_wt_perc, ar_bmi_perc, ar_ht_res, ar_wt_res, ar_bmi_res) = find_percentile(name_patient, age_right, resampled_grouped_age, resampled_grouped_patient_age)
+                
+            elif age_right > group_patient["age"].max():
+                (ar_ht_perc, ar_wt_perc, ar_bmi_perc, ar_ht_res, ar_wt_res, ar_bmi_res) = find_percentile(name_patient, age_left, resampled_grouped_age, resampled_grouped_patient_age)
+                
+            else:
+                (ht_left_perc, wt_left_perc, bmi_left_perc, ar_ht_left_res, ar_wt_left_res, ar_bmi_left_res) = \
+                               find_percentile(name_patient, age_left, resampled_grouped_age, resampled_grouped_patient_age)
+                (ht_right_perc, wt_right_perc, bmi_right_perc, ar_ht_right_res, ar_wt_right_res, ar_bmi_right_res) = \
+                                find_percentile(name_patient, age_right, resampled_grouped_age, resampled_grouped_patient_age)
+                    
+                right_ratio = (ar_age - age_left)/(age_right - age_left)
+                left_ratio = 1 - right_ratio
+                ar_ht_perc = ht_left_perc * left_ratio + ht_right_perc * right_ratio
+                ar_wt_perc = wt_left_perc * left_ratio + wt_right_perc * right_ratio
+                ar_bmi_perc = bmi_left_perc * left_ratio + bmi_right_perc * right_ratio
+                
+                ar_ht_res = ar_ht_left_res * left_ratio + ar_ht_right_res * right_ratio
+                ar_wt_res = ar_wt_left_res * left_ratio + ar_wt_right_res * right_ratio
+                ar_bmi_res = ar_bmi_left_res * left_ratio + ar_bmi_right_res * right_ratio
+
+    ## Create new numpy list that is of the correct dimensions
+    #row = np.array([None]*len(header_list))
+    #row = np.reshape(row, (1, len(header_list)))
+
+    ## Create new DataFrame row
+    #df_row = pd.DataFrame(row, columns = header_list)
+    df_row = dict()
+
+    ## Save patient information to this new row
+    df_row["id"] = name_patient
+    df_row["gender"] = group_patient["gender"].iloc[0]
+    df_row["race_ethnicity"] = group_patient["race_ethnicity"].iloc[0]
+    df_row["count"] = num_x
+    df_row["ar_find"] = ar_find
+    df_row["age_ar"] = ar_age
+    df_row["age_front"] = first_age
+    df_row["age_end"] = last_age
+
+    ## Find and save height, weight, BMI percentiles at the last resampled datapoint
+    (df_row["perc_ht_end"], df_row["perc_wt_end"], df_row["perc_bmi_end"], \
+     df_row["res_ht_end"], df_row["res_wt_end"], df_row["res_bmi_end"]) = \
+                            find_percentile(name_patient, last_age, resampled_grouped_age, resampled_grouped_patient_age)
+
+    ## Find and save height, weight, BMI percentiles at the first resampled datapoint
+    (df_row["perc_ht_front"], df_row["perc_wt_front"], df_row["perc_bmi_front"], \
+     df_row["res_ht_front"], df_row["res_wt_front"], df_row["res_bmi_front"]) = \
+                            find_percentile(name_patient, first_age, resampled_grouped_age, resampled_grouped_patient_age)
+
+    ## Seight, weight, BMI percentiles at AR
+    (df_row["perc_ht_ar"], df_row["perc_wt_ar"], df_row["perc_bmi_ar"]) = \
+                            (ar_ht_perc, ar_wt_perc, ar_bmi_perc)
+    (df_row["res_ht_ar"], df_row["res_wt_ar"], df_row["res_bmi_ar"]) = \
+                            (ar_ht_res, ar_wt_res, ar_bmi_res)
+
+    ## Append new row to aggregate AR info dataframe
+    df_ar_list.append(df_row)
+    #df_ar_info = pd.DataFrame.append(df_ar_info, df_row)
+
+    if input_patient_bool:
+        break
+df_ar_info = pd.DataFrame(df_ar_list)
+    
+if not input_patient_bool:
+    ## Dump data
+    #output = open('../../data/pkl/BMI_ar_4_onlymethod4_calculate_5pt.pkl', 'wb')
+    output = open('../../data/pkl/BMI_filtered_contain_age5_ar_calculate.pkl', 'wb')
+    pickle.dump(df_ar_info, output, -1)
+    output.close()
+
+    #df_ar_info.to_csv("../../data/csv/BMI_ar_4_onlymethod4_calculate_5pt.csv")
+    df_ar_info.to_csv("../../data/csv/BMI_filtered_contain_age5_ar_calculate.csv")