
code/init_processing/init_processing.py
##### SETUP ######

import pickle
import numpy as np
import pandas as pd
import math
import scipy.interpolate as interpolate
import matplotlib.pyplot as plt
import pylab as P
import statsmodels.api as sm
from scipy import stats

## Set up age intervals
## Float instability created errors, so the interval values are truncated to two decimal places
intervals = np.trunc(np.concatenate((np.array([0.01]),np.arange(.05,.2,.05),np.arange(.4,2,.2), np.arange(2,20,.5)))*100)/100
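## Resulting age grid (years): 0.01, then 0.05-0.15 in steps of 0.05, 0.4-1.8 in steps
## of 0.2, and 2-19.5 in steps of 0.5; multiplying by 100, truncating, and dividing back
## snaps each value to two decimals and removes the float noise from np.arange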

##################

##### VARIABLES #####

## Minimum number of datapoints for patient to pass filter
min_datapoints = 5

## CSV raw document location
csv_location = "../../data/csv/BMI.csv"

## Individuals with outlier data
## 16262: First two datapoints at ages ~1.8 have heights of ~2 inches
bad_patients = [16262]

#####################

##### FUNCTIONS #####

## Change numeric datatypes to appropriate floats or ints
def change_dtypes(df):
    if "wt" in df.columns:
        df["wt"] = df["wt"].astype(np.float64)
    else:
        df["wt_ounces"] = df["wt_ounces"].astype(np.float64)
    df["age"] = df["age"].astype(np.float64)
    df["bmi"] = df["bmi"].astype(np.float64)
    df["id"] = df["id"].astype(np.int64)
    df["ht"] = df["ht"].astype(np.float64)
    return df

## Fit an OLS curve to the group patient's y as a function of x
def fit(group_patient, header_y, header_x, alpha = 0.01):

    x = group_patient[header_x]
    y = np.log(group_patient[header_y])

    ## Count number of datapoints for this patient
    num_x = group_patient.shape[0]

    ## np.ones necessary to include constant in regression
    X = np.column_stack((x*x*x, x*x, x, np.ones(num_x)))
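    ## Design matrix for a cubic polynomial: log(y) is modeled as
    ## b3*x**3 + b2*x**2 + b1*x + b0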

    ## OLS fit
    model = sm.OLS(y,X)
    results = model.fit()

    ## Option 1: Outliers defined as datapoints with large studentized residuals
    infl = results.get_influence()
    resids = infl.resid_studentized_external
    ## Calculate residual cutoff
    resid_cutoff = stats.t.ppf(1.-alpha/2, results.df_resid)
    large_resid = np.abs(resids) > resid_cutoff
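    ## resid_studentized_external are leave-one-out (externally) studentized residuals;
    ## points whose absolute value exceeds the two-sided t critical value at level alpha are flagged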

    ## Option 2: Outliers defined as datapoints with Cook's distance greater than 4/N
    #(c, p) = infl.cooks_distance
    #cooks_cutoff = 4.0/group_patient.shape[0]
    #large_cooks = c > cooks_cutoff

    ## Option 3: .. or both
    #outliers_bool = np.logical_or(large_resid, large_cooks)

    ## Currently choose option 1 for defining outliers
    return large_resid
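
## Example (hypothetical) usage of fit(): flag and drop height outliers for one patient
#example_patient = grouped_patients.get_group(np.int64(5035))
#ht_outliers = fit(example_patient, "ht", "age")
#example_patient = example_patient[np.logical_not(ht_outliers)]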

## Fit an OLS curve to the group patient's y as a function of x
## Has more restrictive criteria than fit(), and is therefore better to use when the
## patient's measurements do not fluctuate as much
def fit2(group_patient, header_y, header_x):

    x = group_patient[header_x]
    y = group_patient[header_y]

    ## Count number of datapoints for this patient
    num_x = group_patient.shape[0]

    ## np.ones necessary to include constant in regression
    X = np.column_stack((np.log(x), np.ones(num_x)))
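    ## Design matrix for a log-linear model: y is modeled as b1*log(x) + b0,
    ## a slowly varying curve suited to near-flat adult measurements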

    ## OLS fit
    model = sm.OLS(y,X)
    fitted = model.fit()

    ## Print summary of fit
    #print fitted.summary()

    ## Option 1: Outliers defined as datapoints with residuals greater than 2.5 standard deviations
    outliers_bool_norm_resid = abs(fitted.norm_resid()) > 2.5
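    ## norm_resid() comes from an older statsmodels API and returns residuals scaled to
    ## unit variance; newer statsmodels releases expose the equivalent as fitted.resid_pearson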

    ## Option 2: Outliers defined as datapoints with Cook's distance greater than 4/(N-K-1)
    #influence = fitted.get_influence()
    #(c, p) = influence.cooks_distance
    #outliers_bool_cooks_distance = c > 4.0/(num_x-2)

    ## Option 3: .. or both
    #outliers_bool = np.logical_or(outliers_bool_norm_resid, outliers_bool_cooks_distance)

    ## Currently choose option 1 for defining outliers
    return outliers_bool_norm_resid

#####################

## Read in document
df = pd.read_csv(csv_location)

## Drop columns not needed
df = df.drop(["Height (ft&inc)","Body Mass Index Percentile","Age on 5/1/2012","BMI","Patient Ethnicity","Patient Race"], axis=1)
df.rename(columns={"Patient ID": "id", "Gender": "gender", "Race/Ethnicity": "race_ethnicity", "Patient Age At Encounter": "age", \
                   "Body Mass Index": "bmi", "Weight (Ounces)":"wt_ounces", "Height (Inches)":"ht"}, inplace=True)

## Convert numerical columns to floats (and remove #VALUE! if necessary)
cond = df["ht"] == "#VALUE!"
df["ht"][cond] = "NaN"
df = change_dtypes(df)

## Fill in missing values: any two of bmi, wt, ht are enough to resolve the third
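## Imperial BMI formula used for all conversions below: bmi = weight_lb / height_in**2 * 703
## (wt_ounces/16 converts ounces to pounds)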
selector = np.isnan(df["bmi"]) & ~np.isnan(df["wt_ounces"]) & ~np.isnan(df["ht"])
missing_series = df[selector]["wt_ounces"]/16/(np.power(df[selector]["ht"],2))*703
df["bmi"] = pd.concat([df["bmi"].dropna(), missing_series.dropna()]).reindex_like(df)

selector = ~np.isnan(df["bmi"]) & np.isnan(df["wt_ounces"]) & ~np.isnan(df["ht"])
missing_series = df[selector]["bmi"]*np.power(df[selector]["ht"],2)/703*16
df["wt"] = pd.concat([df["wt_ounces"].dropna(), missing_series.dropna()]).reindex_like(df)

selector = ~np.isnan(df["bmi"]) & ~np.isnan(df["wt_ounces"]) & np.isnan(df["ht"])
missing_series = np.power(df[selector]["wt_ounces"]/16/(df[selector]["bmi"])*703,0.5)
df["ht"] = pd.concat([df["ht"].dropna(), missing_series.dropna()]).reindex_like(df)

## Remove rows that can't be resolved for bmi, wt, or ht
selector = ~np.isnan(df["bmi"]) & ~np.isnan(df["wt_ounces"]) & ~np.isnan(df["ht"]) & df["gender"].notnull()
df = df[selector]

## Convert from ounces to pounds
df["wt"] = df["wt_ounces"]/16
df = df.drop("wt_ounces", axis=1)

## Remove bad patients
for patient_id in bad_patients:
    df[df["id"] == patient_id] = np.nan
df = df.dropna()

## Remove datapoints where ht = 0
df = df[df["ht"] != 0]

## Sort rows by patient id and age, then reindex
df = df.sort(["id", "age"], ascending=[1,1])
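## Note: DataFrame.sort() was removed in pandas 0.20; on newer pandas the equivalent
## call is df.sort_values(["id", "age"])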
df = df.reindex()

## Group by patient id
grouped_patients = df.groupby("id")

df_resampled_list = []
df_filtered_list = []
df_filtered_out_list = []
df_outliers_list = []

## Iterate through each patient in the dataframe
## Note: appending to a pandas DataFrame is O(n^2), whereas appending to a list of dictionaries is O(n)
## Takes ~24 mins
for name_patient, group_patient in grouped_patients:
    ## To grab a single patient, type:
    #grouped_patients.get_group(np.int64(5035))

    ## Skip patients that have fewer than the minimum number of datapoints
    if group_patient.shape[0] < min_datapoints:
        group_patient.apply(lambda x: df_filtered_out_list.append(x.to_dict()), axis = 1)
        #df_filtered_out = pd.DataFrame.append(df_filtered_out, group_patient)
        continue

    ## Change datapoints with age 0 to age 0.01 (so their log can be taken in the regression)
    group_patient["age"][group_patient["age"] == np.float64(0)] = np.float64(0.01)

    ## Conduct regression of ht vs age and wt vs age to determine outliers

    ## Use fit2() if the patient only has data from 15 yrs old onward, as the
    ## patient's measurements do not change much
    if group_patient["age"].min() > 15:
        outliers_bool_ht = fit2(group_patient, "ht", "age")
        outliers_bool_wt = fit2(group_patient, "wt", "age")
    ## Otherwise use fit()
    else:
        outliers_bool_ht = fit(group_patient, "ht", "age")
        outliers_bool_wt = fit(group_patient, "wt", "age")
    outliers_bool = np.logical_or(outliers_bool_ht, outliers_bool_wt)

    ## Remove outlier datapoints and save removed datapoints to df_outliers
    if any(outliers_bool):
        group_patient[outliers_bool].apply(lambda x: df_outliers_list.append(x.to_dict()), axis = 1)
    group_patient = group_patient[np.logical_not(outliers_bool)]

    ## Check again after removing outliers
    ## Skip patients that have fewer than the minimum number of datapoints
    if group_patient.shape[0] < min_datapoints:
        group_patient.apply(lambda x: df_filtered_out_list.append(x.to_dict()), axis = 1)
        continue

    ## Interpolate ht and wt linearly and resample at defined age intervals
##    f_ht = interpolate.interp1d(group_patient["age"], np.log(group_patient["ht"]), kind='cubic', bounds_error=False)
##    f_wt = interpolate.interp1d(group_patient["age"], np.log(group_patient["wt"]), kind='cubic', bounds_error=False)
##    df_resampled_patient = pd.DataFrame(intervals, columns=["age"])
##    df_resampled_patient["ht"] = np.power(math.e, df_resampled_patient["age"].apply(f_ht))
##    df_resampled_patient["wt"] = np.power(math.e, df_resampled_patient["age"].apply(f_wt))

    f_ht = interpolate.interp1d(group_patient["age"], group_patient["ht"], kind='linear', bounds_error=False)
    f_wt = interpolate.interp1d(group_patient["age"], group_patient["wt"], kind='linear', bounds_error=False)
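    ## bounds_error=False makes interp1d return NaN (rather than raise) for ages outside
    ## the patient's observed range; those rows are dropped below since no extrapolation is done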
    df_resampled_patient = pd.DataFrame(intervals, columns=["age"])
    df_resampled_patient["ht"] = df_resampled_patient["age"].apply(f_ht)
    df_resampled_patient["wt"] = df_resampled_patient["age"].apply(f_wt)

    ## Drop NAs, which occur because no extrapolation is conducted
    df_resampled_patient = df_resampled_patient.dropna()

    ## Recalculate BMIs
    df_resampled_patient["bmi"] = df_resampled_patient["wt"]/(np.power(df_resampled_patient["ht"],2))*703

    ## Save patient attributes
    df_resampled_patient["id"] = name_patient
    df_resampled_patient["gender"] = group_patient["gender"].max()
    df_resampled_patient["race_ethnicity"] = group_patient["race_ethnicity"].max()

    ## Save filtered and resampled data
    df_resampled_patient.apply(lambda x: df_resampled_list.append(x.to_dict()), axis = 1)
    group_patient.apply(lambda x: df_filtered_list.append(x.to_dict()), axis = 1)

## Convert list of dictionaries to dataframes
df_resampled = change_dtypes(pd.DataFrame(df_resampled_list).dropna())
df_filtered = change_dtypes(pd.DataFrame(df_filtered_list).dropna())
df_filtered_out = change_dtypes(pd.DataFrame(df_filtered_out_list).dropna())
df_outliers = change_dtypes(pd.DataFrame(df_outliers_list).dropna())

## Print dataframes to CSVs
df_resampled.to_csv("../../data/csv/BMI_resampled.csv", index_label=False, index=False)
df_filtered.to_csv("../../data/csv/BMI_filtered.csv", index_label=False, index=False)
df_filtered_out.to_csv("../../data/csv/BMI_filtered_out.csv", index_label=False, index=False)
df_outliers.to_csv("../../data/csv/BMI_outliers.csv", index_label=False, index=False)

## Dump files with pickle using the highest protocol available
output = open('../../data/pkl/BMI_resampled.pkl', 'wb')
pickle.dump(df_resampled, output, -1)
output.close()
output = open('../../data/pkl/BMI_filtered.pkl', 'wb')
pickle.dump(df_filtered, output, -1)
output.close()
output = open('../../data/pkl/BMI_filtered_out.pkl', 'wb')
pickle.dump(df_filtered_out, output, -1)
output.close()
output = open('../../data/pkl/BMI_outliers.pkl', 'wb')
pickle.dump(df_outliers, output, -1)
output.close()