--- a +++ b/code/adiposity_rebound/ar_calc.py @@ -0,0 +1,425 @@ +##### SETUP ###### + +#import sys +#sys.path.append('../visualize') +#import vis_quartiles_individual + +import pickle +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import statsmodels.api as sm +from scipy import interpolate +import math + +##### VARIABLES ###### + +attributes = ["ht","wt","bmi"] +percentiles = np.array([3, 5, 10, 25, 50, 75, 85, 90, 95, 97]) +intervals = np.trunc(np.concatenate((np.array([0.01]),np.arange(.05,.2,.05),np.arange(.4,2,.2), np.arange(2,20,.5)))*100)/100 + +## Plot figures? +plot_bool = False + +## Plot for blog style if plot_bool is true? +plot_for_blog = False + +## Input individual patients? (Otherwise pulls all patients from BMI_filtered_contain_age5.pkl) +input_patient_bool = False + +## Individual patient to analyze if input_patient_bool is true +patient_id = 16785 #16785, 10258, 9026, 12322, 11995, 12552 + +## Limit number of patients plot +count_bool = False + +## Plot log on y axis? +plot_log = False +font_size = 14 + + +###################### + +##### FUNCTIONS ###### + +def find_percentile(id_patient, age, grouped_age, grouped_patient_age): + df_datapoint = grouped_patient_age.get_group((id_patient, age)) + ht = df_datapoint["ht"] + wt = df_datapoint["wt"] + bmi = df_datapoint["bmi"] + gender = df_datapoint["gender"] + + group_age = grouped_age.get_group(age) + group_age = group_age[group_age["gender"] == gender.iloc[0]] + + num_patients = group_age.shape[0] + percent_ht = np.sum(group_age["ht"] < ht.iloc[0])*100.0/num_patients + percent_wt = np.sum(group_age["wt"] < wt.iloc[0])*100.0/num_patients + percent_bmi = np.sum(group_age["bmi"] < bmi.iloc[0])*100.0/num_patients + return (percent_ht, percent_wt, percent_bmi, ht.iloc[0], wt.iloc[0], bmi.iloc[0]) + +###################### + +## Open pickle file, saved from bmi_adiposity_rebound_filter.py +## df_range contains individuals that have data in the age range where AR occurs +df = pickle.load(open('../../data/pkl/BMI_filtered_contain_age5.pkl', 'rb')) + +## Open pickle file, saved from bmi_initial_processing.py +df_resampled = pickle.load(open('../../data/pkl/BMI_resampled.pkl', 'rb')) + +## Already ran calculation ## +## Keep only method 4 results ## +#pat_list = pd.read_csv("../../data/csv/BMI_ar_4_method4.csv", names = ["Patient ID"]) +#df = df[df["Patient ID"].isin(pat_list["Patient ID"])] + +## Group datapoints for each patient +grouped = df.groupby(["id"]) + +resampled_grouped_age = df_resampled.groupby(["age"]) +resampled_grouped_patient_age = df_resampled.groupby(["id", "age"]) + +## Create new list to add AR information for each patient +df_ar_list = [] + +## Iterate through each patient +count = 0 +for name_patient, group_patient in grouped: + + #print name_patient + if count_bool: + count = count + 1 + if count < 0: + continue + if count == 20: + break + if input_patient_bool: + name_patient = patient_id + group_patient = grouped.get_group(name_patient) + #print name_patient + #print group_patient + + # Throw out data less than 2 years + group_patient = group_patient[group_patient["age"] >= 2.0] + + if plot_bool and not plot_for_blog: + ## Create figure to hold patient-specific plots + fig = plt.figure() + ax_age_ht = fig.add_subplot(221) + ax_age_wt = fig.add_subplot(222) + ax_age_bmi = fig.add_subplot(223) + ax_velo = fig.add_subplot(224) ## First derivative of log(ht), log(wt), and log(bmi) + elif plot_for_blog: + fig = plt.figure(1, figsize=(15, 4.75)) + fig.subplots_adjust(bottom=0.15, left=0.05, right=0.975) + ax_age_ht = fig.add_subplot(131) + ax_age_wt = fig.add_subplot(132) + ax_age_bmi = fig.add_subplot(133) + fig2 = plt.figure() + ax_velo = fig2.add_subplot(111) + + ## Count number of datapoints for this patient + num_x = group_patient.shape[0] + + # And at least 5 datapoints? + if num_x < 5: + continue + + ## Regression of log(ht) on age + x = group_patient["age"] + log_ht = np.log(group_patient["ht"]) + y = log_ht + + ## Model with cubic equation + X = np.column_stack((x*x*x, x*x, x, np.ones(num_x))) + + ## Make OLS model + model = sm.OLS(y,X) + results = model.fit() + + ## Sample fit + b = results.params + xnew = np.linspace(x.min(),x.max(),100) + ynew_ht_log = b[0]*xnew*xnew*xnew + b[1]*xnew*xnew + b[2]*xnew + b[3] + ynew_ht = np.power(math.e, ynew_ht_log) + + ## Calculate first derivative of log(ht) + ynew_ht_log_velo = 3*b[0]*xnew*xnew + 2*b[1]*xnew + b[2] + + if plot_bool: + ax_velo.plot(xnew,2* ynew_ht_log_velo, 'r-', label="2*d(log(Height))/dt") + if plot_log: + ## Plot original data points and fit on log-linear plot + ax_age_ht.plot(x,y, 'ro') + ax_age_ht.plot(xnew,ynew_ht_log, 'r-') + else: + ## Plot original data points and fit on lin-lin plot + ax_age_ht.plot(x,group_patient["ht"], 'ro') + ax_age_ht.plot(xnew,ynew_ht, 'r-') + + ## Regression of log(wt) on age + x = group_patient["age"] + log_wt = np.log(group_patient["wt"]) + y = log_wt + + ## Model with cubic equation + X = np.column_stack((x*x*x, x*x, x, np.ones(num_x))) + + ## Make RLM model + #model = sm.OLS(y,X) + model = sm.RLM(y,X) + + ## Sample fit + results = model.fit() + b = results.params + xnew = np.linspace(x.min(),x.max(),100) + ynew_wt_log = b[0]*xnew*xnew*xnew + b[1]*xnew*xnew + b[2]*xnew + b[3] + ynew_wt = np.power(math.e, ynew_wt_log) + + ## Calculate first derivative of log(ht) + ynew_wt_log_velo = 3*b[0]*xnew*xnew + 2*b[1]*xnew + b[2] + + if plot_bool: + ax_velo.plot(xnew,ynew_wt_log_velo, 'b-', label="d(log(Weight))/dt") + if plot_log: + ## Plot original data points and fit on lin-lin plot + ax_age_wt.plot(x,y, 'bo') + ax_age_wt.plot(xnew,ynew_wt_log, 'b-') + else: + ## Plot original data points and fit on log-linear plot + ax_age_wt.plot(x,group_patient["wt"], 'bo') + ax_age_wt.plot(xnew,ynew_wt, 'b-') + + ## Calculate first derivative of log(bmi) from first derivatives of log(wt) and log(ht) + bmi_velo = np.subtract(ynew_wt_log_velo, 2*ynew_ht_log_velo) + + if plot_bool: + ## Plot first derivative of log(bmi) on log-linear plot + ax_velo.plot(xnew, bmi_velo, 'g-', label = "d(log(BMI))/dt") + ax_velo.axhline(y=0) + + ## Plot original BMI datapoints + ax_age_bmi.plot(group_patient["age"],group_patient["bmi"], 'go') + + ynew_bmi = np.divide(ynew_wt, np.power(ynew_ht, 2)) * 703 + ax_age_bmi.plot(xnew, ynew_bmi, 'g-') + + ## Set axis labels + ax_age_ht.set_xlabel("Age (years)") + ax_age_wt.set_xlabel("Age (years)") + ax_age_bmi.set_xlabel("Age (years)") + if plot_log: + ax_age_ht.set_ylabel("log(Height (inches))") + ax_age_wt.set_ylabel("log(Weight (pounds))") + else: + ax_age_ht.set_ylabel("Height (inches)") + ax_age_wt.set_ylabel("Weight (pounds)") + ax_age_bmi.set_ylabel("BMI") + + ax_velo.set_xlabel("Age (years)") + ax_velo.set_ylabel("d/dt(log(variable))") + ax_velo.legend(loc = "lower right", prop={'size':font_size}) + + ## Set title + if not plot_for_blog: + fig.suptitle("Patient #" + str(int(name_patient)) + " Progressing through Adiposity Rebound", fontsize = font_size) + else: + fig.suptitle("Individual Patient with Adiposity Rebound", fontsize = font_size) + + font = {'family' : 'normal', 'weight' : 'normal', 'size' : font_size} + plt.rc('font', **font) + + ## Create DataFrame with first derivatives + df_derivs = pd.DataFrame({"age":xnew, "ht_velo": ynew_ht_log_velo, "wt_velo": ynew_wt_log_velo, "bmi_velo": bmi_velo}) + + ## Adiposity rebound occurs when velocity of log(BMI) goes from negative to + ## to positive. If the velocity stays positive, the ratio curve opens up, + ## and thus the minimum of the curve is the adiposity rebound + + ## Default value of ar_age, indicating it could not be found + ar_age = np.nan + + ## CASE 1: BMI derivative stays above 0. Pick the minimum. + if df_derivs["bmi_velo"].min() > 0: + ar_age = df_derivs["age"].ix[df_derivs["bmi_velo"].idxmin()] + if plot_bool: + ax_velo.axvline(x=ar_age) + #print "here1" + ar_find = 1 + + ## CASE 2: BMI derivative stays below 0. Pick the maximum. + elif df_derivs["bmi_velo"].max() < 0: + ar_age = df_derivs["age"].ix[df_derivs["bmi_velo"].idxmax()] + if plot_bool: + ax_velo.axvline(x=ar_age) + #print "here2" + ar_find = 2 + + ## Note that the BMI derivative might cross 0 twice + else: + ## Find the index of the datapoint with the largest BMI derivative + max_bmi_velo_index = df_derivs["bmi_velo"].idxmax() + min_bmi_velo_index = df_derivs["bmi_velo"].idxmin() + + ## Find the largest index + largest_index = max(df_derivs["bmi_velo"].index) + + ## CASE 3: BMI curve starts > 0, ends < 0. Unusual because that means + ## BMI goes through maximum. Maybe pick earliest age as AR? + if df_derivs["bmi_velo"].iloc[0] > 0 and df_derivs["bmi_velo"].iloc[-1] < 0: + ar_find = 3 + df_derivs_cut = df_derivs + + ## CASE 4: BMI curve starts < 0, ends > 0. + elif df_derivs["bmi_velo"].iloc[0] < 0 and df_derivs["bmi_velo"].iloc[-1] > 0: + ar_find = 4 + df_derivs_cut = df_derivs.ix[min_bmi_velo_index:largest_index] + + ## CASE 5: BMI curve starts and ends > 0, and opens up, crossing 0 twice + ## So extract portion from [minimum, right end] when BMI derivative crosses + ## from negative to positive + elif df_derivs["bmi_velo"].iloc[0] > 0 and df_derivs["bmi_velo"].iloc[-1] > 0: + ar_find = 5 + df_derivs_cut = df_derivs.ix[min_bmi_velo_index:largest_index] + + ## CASE 6: BMI curve starts and ends < 0, and opens down, crossing 0 twice + ## So extract portion from [left end, maximum] when BMI derivative crosses + ## from negative to positive + else: + ar_find = 6 + df_derivs_cut = df_derivs.ix[0:max_bmi_velo_index] + + ## Linearly interpolate age on first derivative of log(bmi) + #f = interpolate.InterpolatedUnivariateSpline(df_derivs_cut["bmi_velo"], df_derivs_cut["age"]) + f = interpolate.interp1d(df_derivs_cut["bmi_velo"], df_derivs_cut["age"]) + + ## Try to find age at which first derivative of log(bmi) == 0 + try: + ar_age = f(0) + if plot_bool: + ax_velo.axvline(x=ar_age) + if plot_for_blog: + ax_age_bmi.axvline(x=ar_age, color = "red", ls = "--") + + ## If cannot interpolate, means that either no ages or two ages were found. Thus, cannot determine AR. + except ValueError as inst: + print "ERROR: " + str(name_patient) + " - " + inst.args[0] + + if plot_bool: + ## Plot when patient's indvidual curve over population growth curves + #vis_quartiles_individual.plot_individual_against_percentiles(name_patient) + + + ## Display plot + plt.show() #bbox_inches='tight', transparent="True", pad_inches=0 + + ## Now calculate percentile patient is end at the end of his/her growth curve + + ## Use resampled dataset, and group by patients + grouped_resampled_patient = df_resampled.groupby(["id"]) + grouped_resampled_patient_age = df_resampled.groupby(["id", "age"]) + + ## Select the last age datapoint, and record age, height, weight, and BMI + last_resampled_dpt = grouped_resampled_patient.get_group(name_patient).sort("age").iloc[-1] + last_age = last_resampled_dpt["age"] + last_ht = last_resampled_dpt["ht"] + last_wt = last_resampled_dpt["wt"] + last_bmi = last_resampled_dpt["bmi"] + + first_resampled_dpt = grouped_resampled_patient.get_group(name_patient).sort("age").iloc[0] + first_age = first_resampled_dpt["age"] + + ## Calculate weight, height, BMI percentiles at AR, if AR was found + + if np.isnan(ar_age): + ar_ht_perc = np.nan + ar_wt_perc = np.nan + ar_bmi_perc = np.nan + + ar_ht_res = np.nan + ar_wt_res = np.nan + ar_bmi_res = np.nan + + else: + interval_index_right = np.searchsorted(intervals, ar_age) + + if ar_age in intervals: + (ar_ht_perc, ar_wt_perc, ar_bmi_perc, ar_ht_res, ar_wt_res, ar_bmi_res) = find_percentile(name_patient, ar_age, resampled_grouped_age, resampled_grouped_patient_age) + else: + interval_index_left = interval_index_right - 1 + age_left = intervals[interval_index_left] + age_right = intervals[interval_index_right] + + if age_left < group_patient["age"].min(): + (ar_ht_perc, ar_wt_perc, ar_bmi_perc, ar_ht_res, ar_wt_res, ar_bmi_res) = find_percentile(name_patient, age_right, resampled_grouped_age, resampled_grouped_patient_age) + + elif age_right > group_patient["age"].max(): + (ar_ht_perc, ar_wt_perc, ar_bmi_perc, ar_ht_res, ar_wt_res, ar_bmi_res) = find_percentile(name_patient, age_left, resampled_grouped_age, resampled_grouped_patient_age) + + else: + (ht_left_perc, wt_left_perc, bmi_left_perc, ar_ht_left_res, ar_wt_left_res, ar_bmi_left_res) = \ + find_percentile(name_patient, age_left, resampled_grouped_age, resampled_grouped_patient_age) + (ht_right_perc, wt_right_perc, bmi_right_perc, ar_ht_right_res, ar_wt_right_res, ar_bmi_right_res) = \ + find_percentile(name_patient, age_right, resampled_grouped_age, resampled_grouped_patient_age) + + right_ratio = (ar_age - age_left)/(age_right - age_left) + left_ratio = 1 - right_ratio + ar_ht_perc = ht_left_perc * left_ratio + ht_right_perc * right_ratio + ar_wt_perc = wt_left_perc * left_ratio + wt_right_perc * right_ratio + ar_bmi_perc = bmi_left_perc * left_ratio + bmi_right_perc * right_ratio + + ar_ht_res = ar_ht_left_res * left_ratio + ar_ht_right_res * right_ratio + ar_wt_res = ar_wt_left_res * left_ratio + ar_wt_right_res * right_ratio + ar_bmi_res = ar_bmi_left_res * left_ratio + ar_bmi_right_res * right_ratio + + ## Create new numpy list that is of the correct dimensions + #row = np.array([None]*len(header_list)) + #row = np.reshape(row, (1, len(header_list))) + + ## Create new DataFrame row + #df_row = pd.DataFrame(row, columns = header_list) + df_row = dict() + + ## Save patient information to this new row + df_row["id"] = name_patient + df_row["gender"] = group_patient["gender"].iloc[0] + df_row["race_ethnicity"] = group_patient["race_ethnicity"].iloc[0] + df_row["count"] = num_x + df_row["ar_find"] = ar_find + df_row["age_ar"] = ar_age + df_row["age_front"] = first_age + df_row["age_end"] = last_age + + ## Find and save height, weight, BMI percentiles at the last resampled datapoint + (df_row["perc_ht_end"], df_row["perc_wt_end"], df_row["perc_bmi_end"], \ + df_row["res_ht_end"], df_row["res_wt_end"], df_row["res_bmi_end"]) = \ + find_percentile(name_patient, last_age, resampled_grouped_age, resampled_grouped_patient_age) + + ## Find and save height, weight, BMI percentiles at the first resampled datapoint + (df_row["perc_ht_front"], df_row["perc_wt_front"], df_row["perc_bmi_front"], \ + df_row["res_ht_front"], df_row["res_wt_front"], df_row["res_bmi_front"]) = \ + find_percentile(name_patient, first_age, resampled_grouped_age, resampled_grouped_patient_age) + + ## Seight, weight, BMI percentiles at AR + (df_row["perc_ht_ar"], df_row["perc_wt_ar"], df_row["perc_bmi_ar"]) = \ + (ar_ht_perc, ar_wt_perc, ar_bmi_perc) + (df_row["res_ht_ar"], df_row["res_wt_ar"], df_row["res_bmi_ar"]) = \ + (ar_ht_res, ar_wt_res, ar_bmi_res) + + ## Append new row to aggregate AR info dataframe + df_ar_list.append(df_row) + #df_ar_info = pd.DataFrame.append(df_ar_info, df_row) + + if input_patient_bool: + break +df_ar_info = pd.DataFrame(df_ar_list) + +if not input_patient_bool: + ## Dump data + #output = open('../../data/pkl/BMI_ar_4_onlymethod4_calculate_5pt.pkl', 'wb') + output = open('../../data/pkl/BMI_filtered_contain_age5_ar_calculate.pkl', 'wb') + pickle.dump(df_ar_info, output, -1) + output.close() + + #df_ar_info.to_csv("../../data/csv/BMI_ar_4_onlymethod4_calculate_5pt.csv") + df_ar_info.to_csv("../../data/csv/BMI_filtered_contain_age5_ar_calculate.csv")