[d8a979]: / code / init_processing / init_processing.py

##### SETUP ######
import pickle
import numpy as np
import pandas as pd
import math
import scipy.interpolate as interpolate
import matplotlib.pyplot as plt
import pylab as P
import statsmodels.api as sm
from scipy import stats

## Set up age intervals
## Float instability created errors, so truncate the values to two decimal places
intervals = np.trunc(np.concatenate((np.array([0.01]), np.arange(.05, .2, .05), np.arange(.4, 2, .2), np.arange(2, 20, .5)))*100)/100
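## Resulting grid (years): 0.01, then 0.05-0.15 in steps of 0.05, 0.4-1.8 in steps of 0.2, and 2.0-19.5 in steps of 0.5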
##################
##### VARIABLES #####
## Minimum number of datapoints for a patient to pass the filter
min_datapoints = 5
## Raw CSV document location
csv_location = "../../data/csv/BMI.csv"
## Individuals with outlier data
## 16262: first two datapoints, at age ~1.8, have heights of ~2 inches
bad_patients = [16262]
#####################
##### FUNCTIONS #####
## Convert numeric columns to appropriate floats or ints
def change_dtypes(df):
    if "wt" in df.columns:
        df["wt"] = df["wt"].astype(np.float64)
    else:
        df["wt_ounces"] = df["wt_ounces"].astype(np.float64)
    df["age"] = df["age"].astype(np.float64)
    df["bmi"] = df["bmi"].astype(np.float64)
    df["id"] = df["id"].astype(np.int64)
    df["ht"] = df["ht"].astype(np.float64)
    return df
## Fit an OLS curve (cubic polynomial in x) to the log of the patient's y
def fit(group_patient, header_y, header_x, alpha=0.01):
    x = group_patient[header_x]
    y = np.log(group_patient[header_y])
    ## Count number of datapoints for this patient
    num_x = group_patient.shape[0]
    ## np.ones necessary to include a constant in the regression
    X = np.column_stack((x*x*x, x*x, x, np.ones(num_x)))
    ## OLS fit
    model = sm.OLS(y, X)
    results = model.fit()
    ## Option 1: Outliers defined as datapoints with large externally studentized residuals
    infl = results.get_influence()
    resids = infl.resid_studentized_external
    ## Calculate residual cutoff
    resid_cutoff = stats.t.ppf(1. - alpha/2, results.df_resid)
    large_resid = np.abs(resids) > resid_cutoff
    ## Option 2: Outliers defined as datapoints with Cook's distance greater than 4/N
    #(c, p) = infl.cooks_distance
    #cooks_cutoff = 4.0/group_patient.shape[0]
    #large_cooks = c > cooks_cutoff
    ## Option 3: ... or both
    #outliers_bool = np.logical_or(large_resid, large_cooks)
    ## Currently use option 1 for defining outliers
    return large_resid
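
## Note: the main loop below uses fit() for patients with measurements before age 15
## (where ht and wt still change rapidly with age) and fit2() otherwise.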
## Fit an OLS curve (linear in log(x)) to the patient's y
## Has more restrictive criteria than fit(), and is therefore better to use when
## the patient's measurements do not fluctuate as much
def fit2(group_patient, header_y, header_x):
    x = group_patient[header_x]
    y = group_patient[header_y]
    ## Count number of datapoints for this patient
    num_x = group_patient.shape[0]
    ## np.ones necessary to include a constant in the regression
    X = np.column_stack((np.log(x), np.ones(num_x)))
    ## OLS fit
    model = sm.OLS(y, X)
    fitted = model.fit()
    ## Print summary of fit
    #print fitted.summary()
    ## Option 1: Outliers defined as datapoints with normalized residuals greater than 2.5 standard deviations
    outliers_bool_norm_resid = abs(fitted.norm_resid()) > 2.5
    ## Option 2: Outliers defined as datapoints with Cook's distance greater than 4/(N-K-1)
    #influence = fitted.get_influence()
    #(c, p) = influence.cooks_distance
    #outliers_bool_cooks_distance = c > 4.0/(num_x-2)
    ## Option 3: ... or both
    #outliers_bool = np.logical_or(outliers_bool_norm_resid, outliers_bool_cooks_distance)
    ## Currently use option 1 for defining outliers
    return outliers_bool_norm_resid
#####################
## Read in document
df = pd.read_csv(csv_location)
## Drop columns not needed
df = df.drop(["Height (ft&inc)","Body Mass Index Percentile","Age on 5/1/2012","BMI","Patient Ethnicity","Patient Race"], axis=1)
df.rename(columns={"Patient ID": "id", "Gender": "gender", "Race/Ethnicity": "race_ethnicity", "Patient Age At Encounter": "age",
                   "Body Mass Index": "bmi", "Weight (Ounces)": "wt_ounces", "Height (Inches)": "ht"}, inplace=True)
## Convert numerical columns to floats (and remove #VALUE! if necessary)
cond = df["ht"] == "#VALUE!"
df["ht"][cond] = "NaN"
df = change_dtypes(df)
## Fill in missing values: any two of bmi, wt, ht resolve the third
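## Imperial formula: bmi = wt(lb) / ht(in)^2 * 703, with weight recorded in ounces (16 oz = 1 lb)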
selector = np.isnan(df["bmi"]) & ~np.isnan(df["wt_ounces"]) & ~np.isnan(df["ht"])
missing_series = df[selector]["wt_ounces"]/16/(np.power(df[selector]["ht"],2))*703
df["bmi"] = pd.concat([df["bmi"].dropna(), missing_series.dropna()]).reindex_like(df)
selector = ~np.isnan(df["bmi"]) & np.isnan(df["wt_ounces"]) & ~np.isnan(df["ht"])
missing_series = df[selector]["bmi"]*np.power(df[selector]["ht"],2)/703*16
## Imputed weight is in ounces, so write it back into wt_ounces
df["wt_ounces"] = pd.concat([df["wt_ounces"].dropna(), missing_series.dropna()]).reindex_like(df)
selector = ~np.isnan(df["bmi"]) & ~np.isnan(df["wt_ounces"]) & np.isnan(df["ht"])
missing_series = np.power(df[selector]["wt_ounces"]/16/(df[selector]["bmi"])*703,0.5)
df["ht"] = pd.concat([df["ht"].dropna(), missing_series.dropna()]).reindex_like(df)
## Remove rows that can't be resolved for bmi, wt, or ht
selector = ~np.isnan(df["bmi"]) & ~np.isnan(df["wt_ounces"]) & ~np.isnan(df["ht"]) & df["gender"].notnull()
df = df[selector]
## Convert from ounces to pounds
df["wt"] = df["wt_ounces"]/16
df = df.drop("wt_ounces", axis=1)
## Remove bad patients
for patient_id in bad_patients:
    df[df["id"] == patient_id] = np.nan
df = df.dropna()
## Remove datapoints where ht = 0
df = df[df["ht"] != 0]
## Sort rows by patient id and age, then reindex
df = df.sort(["id", "age"], ascending=[1,1])
df = df.reindex()
## Group by patient id
grouped_patients = df.groupby("id")
df_resampled_list = []
df_filtered_list = []
df_filtered_out_list = []
df_outliers_list = []
## Iterate through each patient in the dataframe
## Note: appending rows to a pandas DataFrame is O(n^2), whereas appending to a list of dictionaries is O(n)
## Takes ~24 mins
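## For each patient: skip if too few datapoints, flag regression outliers in ht and wt,
## then linearly resample ht and wt onto the common age grid and recompute bmi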
for name_patient, group_patient in grouped_patients:
    ## To grab a single patient, type:
    #grouped_patients.get_group(np.int64(5035))
    ## Skip patients that have fewer than the minimum number of datapoints
    if group_patient.shape[0] < min_datapoints:
        group_patient.apply(lambda x: df_filtered_out_list.append(x.to_dict()), axis = 1)
        #df_filtered_out = pd.DataFrame.append(df_filtered_out, group_patient)
        continue
    ## Change datapoints with age 0 to age 0.01 (so its log can be taken for the regression)
    group_patient["age"][group_patient["age"] == np.float64(0)] = np.float64(0.01)
    ## Conduct regressions of ht vs age and wt vs age to determine outliers
    ## Use fit2() if the patient only has data from 15 yrs old and on, as
    ## the patient's measurements do not change much
    if group_patient["age"].min() > 15:
        outliers_bool_ht = fit2(group_patient, "ht", "age")
        outliers_bool_wt = fit2(group_patient, "wt", "age")
    ## Otherwise use fit()
    else:
        outliers_bool_ht = fit(group_patient, "ht", "age")
        outliers_bool_wt = fit(group_patient, "wt", "age")
    outliers_bool = np.logical_or(outliers_bool_ht, outliers_bool_wt)
    ## Remove outlier datapoints and save the removed datapoints to df_outliers
    if any(outliers_bool):
        group_patient[outliers_bool].apply(lambda x: df_outliers_list.append(x.to_dict()), axis = 1)
        group_patient = group_patient[np.logical_not(outliers_bool)]
    ## Check again after removing outliers:
    ## skip patients that now have fewer than the minimum number of datapoints
    if group_patient.shape[0] < min_datapoints:
        group_patient.apply(lambda x: df_filtered_out_list.append(x.to_dict()), axis = 1)
        continue
    ## Interpolate ht and wt linearly and resample at the defined age intervals
    ## Cubic interpolation of the logged values is an alternative (not used):
    #f_ht = interpolate.interp1d(group_patient["age"], np.log(group_patient["ht"]), kind='cubic', bounds_error=False)
    #f_wt = interpolate.interp1d(group_patient["age"], np.log(group_patient["wt"]), kind='cubic', bounds_error=False)
    #df_resampled_patient = pd.DataFrame(intervals, columns=["age"])
    #df_resampled_patient["ht"] = np.power(math.e, df_resampled_patient["age"].apply(f_ht))
    #df_resampled_patient["wt"] = np.power(math.e, df_resampled_patient["age"].apply(f_wt))
    f_ht = interpolate.interp1d(group_patient["age"], group_patient["ht"], kind='linear', bounds_error=False)
    f_wt = interpolate.interp1d(group_patient["age"], group_patient["wt"], kind='linear', bounds_error=False)
    df_resampled_patient = pd.DataFrame(intervals, columns=["age"])
    df_resampled_patient["ht"] = df_resampled_patient["age"].apply(f_ht)
    df_resampled_patient["wt"] = df_resampled_patient["age"].apply(f_wt)
    ## Drop NAs, which occur because no extrapolation is conducted
    df_resampled_patient = df_resampled_patient.dropna()
    ## Recalculate BMIs
    df_resampled_patient["bmi"] = df_resampled_patient["wt"]/(np.power(df_resampled_patient["ht"],2))*703
    ## Save patient attributes
    df_resampled_patient["id"] = name_patient
    df_resampled_patient["gender"] = group_patient["gender"].max()
    df_resampled_patient["race_ethnicity"] = group_patient["race_ethnicity"].max()
    ## Save filtered and resampled data
    df_resampled_patient.apply(lambda x: df_resampled_list.append(x.to_dict()), axis = 1)
    group_patient.apply(lambda x: df_filtered_list.append(x.to_dict()), axis = 1)
## Convert lists of dictionaries to dataframes
df_resampled = change_dtypes(pd.DataFrame(df_resampled_list).dropna())
df_filtered = change_dtypes(pd.DataFrame(df_filtered_list).dropna())
df_filtered_out = change_dtypes(pd.DataFrame(df_filtered_out_list).dropna())
df_outliers = change_dtypes(pd.DataFrame(df_outliers_list).dropna())
## Print dataframes to CSVs
df_resampled.to_csv("../../data/csv/BMI_resampled.csv", index_label=False, index=False)
df_filtered.to_csv("../../data/csv/BMI_filtered.csv", index_label=False, index=False)
df_filtered_out.to_csv("../../data/csv/BMI_filtered_out.csv", index_label=False, index=False)
df_outliers.to_csv("../../data/csv/BMI_outliers.csv", index_label=False, index=False)
## Dump files with pickle using the highest protocol available
output = open('../../data/pkl/BMI_resampled.pkl', 'wb')
pickle.dump(df_resampled, output, -1)
output.close()
output = open('../../data/pkl/BMI_filtered.pkl', 'wb')
pickle.dump(df_filtered, output, -1)
output.close()
output = open('../../data/pkl/BMI_filtered_out.pkl', 'wb')
pickle.dump(df_filtered_out, output, -1)
output.close()
output = open('../../data/pkl/BMI_outliers.pkl', 'wb')
pickle.dump(df_outliers, output, -1)
output.close()