## code/init_processing/init_processing.py

##### SETUP ######

import pickle
import numpy as np
import pandas as pd
import math
import scipy.interpolate as interpolate
import matplotlib.pyplot as plt
import pylab as P
import statsmodels.api as sm
from scipy import stats

## Set up age intervals
## Floating-point instability created errors, so the values are truncated to two decimals
intervals = np.trunc(np.concatenate((np.array([0.01]),np.arange(.05,.2,.05),np.arange(.4,2,.2), np.arange(2,20,.5)))*100)/100
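## (The resulting grid covers ages 0.01, 0.05-0.15 in steps of 0.05,
## 0.4-1.8 in steps of 0.2, and 2-19.5 in steps of 0.5 years.)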

##################

##### VARIABLES #####

## Minimum number of datapoints for a patient to pass the filter
min_datapoints = 5

## Location of the raw CSV document
csv_location = "../../data/csv/BMI.csv"

## Individuals with outlier data
## 16262: first two datapoints, at ages ~1.8, have heights of ~2 inches
bad_patients = [16262]

#####################

##### FUNCTIONS #####

## Change numeric datatypes to appropriate floats or ints
def change_dtypes(df):
    if "wt" in df.columns:
        df["wt"] = df["wt"].astype(np.float64)
    else:
        df["wt_ounces"] = df["wt_ounces"].astype(np.float64)
    df["age"] = df["age"].astype(np.float64)
    df["bmi"] = df["bmi"].astype(np.float64)
    df["id"] = df["id"].astype(np.int64)
    df["ht"] = df["ht"].astype(np.float64)
    return df

## Fit an OLS curve to the patient's y as a function of x
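## (fits log(y) against a cubic polynomial in x and flags points whose externally
## studentized residuals exceed the two-sided t critical value at level alpha)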
def fit(group_patient, header_y, header_x, alpha = 0.01):

    x = group_patient[header_x]
    y = np.log(group_patient[header_y])

    ## Count number of datapoints for this patient
    num_x = group_patient.shape[0]

    ## np.ones necessary to include constant in regression
    X = np.column_stack((x*x*x, x*x, x, np.ones(num_x)))

    ## OLS fit
    model = sm.OLS(y,X)
    results = model.fit()

    ## Option 1: outliers are datapoints with large externally studentized residuals
    infl = results.get_influence()
    resids = infl.resid_studentized_external
    ## Calculate residual cutoff
    resid_cutoff = stats.t.ppf(1.-alpha/2, results.df_resid)
    large_resid = np.abs(resids) > resid_cutoff

    ## Option 2: outliers are datapoints with Cook's distance greater than 4/N
    #(c, p) = infl.cooks_distance
    #cooks_cutoff = 4.0/group_patient.shape[0]
    #large_cooks = c > cooks_cutoff

    ## Option 3: outliers flagged by either option 1 or option 2
    #outliers_bool = np.logical_or(large_resid, large_cooks)

    ## Currently use option 1 for defining outliers
    return large_resid

## Fit an OLS curve to the patient's y as a function of x
## Has more restrictive criteria than fit(), and is therefore better to use when the
## patient's measurements do not fluctuate as much
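## (fits y against log(x) and flags points whose normalized residuals exceed 2.5
## standard deviations)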
def fit2(group_patient, header_y, header_x):

    x = group_patient[header_x]
    y = group_patient[header_y]

    ## Count number of datapoints for this patient
    num_x = group_patient.shape[0]

    ## np.ones necessary to include constant in regression
    X = np.column_stack((np.log(x), np.ones(num_x)))

    ## OLS fit
    model = sm.OLS(y,X)
    fitted = model.fit()

    ## Print summary of fit
    #print fitted.summary()

    ## Option 1: outliers are datapoints with residuals greater than 2.5 standard deviations
    ## (norm_resid() exists only in older statsmodels; resid_pearson is the modern near-equivalent)
    outliers_bool_norm_resid = abs(fitted.norm_resid()) > 2.5

    ## Option 2: outliers are datapoints with Cook's distance greater than 4/(N-K-1)
    #influence = fitted.get_influence()
    #(c, p) = influence.cooks_distance
    #outliers_bool_cooks_distance = c > 4.0/(num_x-2)

    ## Option 3: outliers flagged by either option 1 or option 2
    #outliers_bool = np.logical_or(outliers_bool_norm_resid, outliers_bool_cooks_distance)

    ## Currently use option 1 for defining outliers
    return outliers_bool_norm_resid

#####################

## Read in document
df = pd.read_csv(csv_location)

## Drop columns not needed
df = df.drop(["Height (ft&inc)","Body Mass Index Percentile","Age on 5/1/2012","BMI","Patient Ethnicity","Patient Race"], axis=1)
df.rename(columns={"Patient ID": "id", "Gender": "gender", "Race/Ethnicity": "race_ethnicity", "Patient Age At Encounter": "age", \
    "Body Mass Index": "bmi", "Weight (Ounces)":"wt_ounces", "Height (Inches)":"ht"}, inplace=True)

## Convert numerical columns to floats (replacing "#VALUE!" entries with NaN)
cond = df["ht"] == "#VALUE!"
df["ht"][cond] = "NaN"
df = change_dtypes(df)

## Fill in missing values: any two of bmi, wt, ht determine the third
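## (standard imperial-units BMI formula: bmi = 703 * wt_lb / ht_in^2, with wt_lb = wt_ounces/16,
## so each of bmi, weight, and height can be recovered from the other two)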
selector = np.isnan(df["bmi"]) & ~np.isnan(df["wt_ounces"]) & ~np.isnan(df["ht"])
missing_series = df[selector]["wt_ounces"]/16/(np.power(df[selector]["ht"],2))*703
df["bmi"] = pd.concat([df["bmi"].dropna(), missing_series.dropna()]).reindex_like(df)

selector = ~np.isnan(df["bmi"]) & np.isnan(df["wt_ounces"]) & ~np.isnan(df["ht"])
missing_series = df[selector]["bmi"]*np.power(df[selector]["ht"],2)/703*16
df["wt"] = pd.concat([df["wt_ounces"].dropna(), missing_series.dropna()]).reindex_like(df)

selector = ~np.isnan(df["bmi"]) & ~np.isnan(df["wt_ounces"]) & np.isnan(df["ht"])
missing_series = np.power(df[selector]["wt_ounces"]/16/(df[selector]["bmi"])*703,0.5)
df["ht"] = pd.concat([df["ht"].dropna(), missing_series.dropna()]).reindex_like(df)

## Remove rows that can't be resolved for bmi, wt, or ht, or that lack gender
selector = ~np.isnan(df["bmi"]) & ~np.isnan(df["wt"]) & ~np.isnan(df["ht"]) & df["gender"].notnull()
df = df[selector]

## Convert from ounces to pounds (the wt column currently holds ounces, filled in above)
df["wt"] = df["wt"]/16
df = df.drop("wt_ounces", axis=1)

## Remove bad patients
for patient_id in bad_patients:
    df[df["id"] == patient_id] = np.nan
df = df.dropna()

## Remove datapoints where ht = 0
df = df[df["ht"] != 0]

## Sort rows by patient id and age, and reindex
df = df.sort(["id", "age"], ascending=[1,1])
df = df.reindex()

## Group by patient id
grouped_patients = df.groupby("id")

df_resampled_list = []
df_filtered_list = []
df_filtered_out_list = []
df_outliers_list = []

## Iterate through each patient in the dataframe
## Note: appending to a pandas DataFrame is O(n^2), whereas appending to a list of dictionaries is O(n)
## Takes ~24 mins
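## (each DataFrame.append copies the full frame, while list.append is amortized O(1))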
for name_patient, group_patient in grouped_patients:
    ## To grab a single patient, type:
    #grouped_patients.get_group(np.int64(5035))

    ## Skip patients that have less than the minimum number of datapoints
    if group_patient.shape[0] < min_datapoints:
        group_patient.apply(lambda x: df_filtered_out_list.append(x.to_dict()), axis = 1)
        #df_filtered_out = pd.DataFrame.append(df_filtered_out, group_patient)
        continue

    ## Change datapoints with age 0 to age 0.01 (in order to take their log for the regression)
    group_patient["age"][group_patient["age"] == np.float64(0)] = np.float64(0.01)

    ## Conduct regressions of ht vs age and wt vs age to determine outliers
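    ## (a datapoint is treated as an outlier if it is flagged in either the ht or the wt fit)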

    ## Use fit2() if the patient only has data from 15 years old onward, as the
    ## patient's measurements do not change much
    if group_patient["age"].min() > 15:
        outliers_bool_ht = fit2(group_patient, "ht", "age")
        outliers_bool_wt = fit2(group_patient, "wt", "age")
    ## Otherwise use fit()
    else:
        outliers_bool_ht = fit(group_patient, "ht", "age")
        outliers_bool_wt = fit(group_patient, "wt", "age")
    outliers_bool = np.logical_or(outliers_bool_ht, outliers_bool_wt)

    ## Remove outlier datapoints and save removed datapoints to df_outliers
    if any(outliers_bool):
        group_patient[outliers_bool].apply(lambda x: df_outliers_list.append(x.to_dict()), axis = 1)
        group_patient = group_patient[np.logical_not(outliers_bool)]

    ## Check again after removing outliers:
    ## skip patients that have less than the minimum number of datapoints
    if group_patient.shape[0] < min_datapoints:
        group_patient.apply(lambda x: df_filtered_out_list.append(x.to_dict()), axis = 1)
        continue

    ## Interpolate ht and wt linearly and resample at the defined age intervals
    ## (a cubic alternative on log-transformed values, currently unused):
    ## f_ht = interpolate.interp1d(group_patient["age"], np.log(group_patient["ht"]), kind='cubic', bounds_error=False)
    ## f_wt = interpolate.interp1d(group_patient["age"], np.log(group_patient["wt"]), kind='cubic', bounds_error=False)
    ## df_resampled_patient = pd.DataFrame(intervals, columns=["age"])
    ## df_resampled_patient["ht"] = np.power(math.e, df_resampled_patient["age"].apply(f_ht))
    ## df_resampled_patient["wt"] = np.power(math.e, df_resampled_patient["age"].apply(f_wt))

    f_ht = interpolate.interp1d(group_patient["age"], group_patient["ht"], kind='linear', bounds_error=False)
    f_wt = interpolate.interp1d(group_patient["age"], group_patient["wt"], kind='linear', bounds_error=False)
    df_resampled_patient = pd.DataFrame(intervals, columns=["age"])
    df_resampled_patient["ht"] = df_resampled_patient["age"].apply(f_ht)
    df_resampled_patient["wt"] = df_resampled_patient["age"].apply(f_wt)

    ## Drop NAs, which occur because no extrapolation is conducted
    df_resampled_patient = df_resampled_patient.dropna()

    ## Recalculate BMIs
    df_resampled_patient["bmi"] = df_resampled_patient["wt"]/(np.power(df_resampled_patient["ht"],2))*703

    ## Save patient attributes
    df_resampled_patient["id"] = name_patient
    df_resampled_patient["gender"] = group_patient["gender"].max()
    df_resampled_patient["race_ethnicity"] = group_patient["race_ethnicity"].max()

    ## Save filtered and resampled data
    df_resampled_patient.apply(lambda x: df_resampled_list.append(x.to_dict()), axis = 1)
    group_patient.apply(lambda x: df_filtered_list.append(x.to_dict()), axis = 1)

## Convert lists of dictionaries to dataframes
df_resampled = change_dtypes(pd.DataFrame(df_resampled_list).dropna())
df_filtered = change_dtypes(pd.DataFrame(df_filtered_list).dropna())
df_filtered_out = change_dtypes(pd.DataFrame(df_filtered_out_list).dropna())
df_outliers = change_dtypes(pd.DataFrame(df_outliers_list).dropna())

## Write dataframes to CSVs
df_resampled.to_csv("../../data/csv/BMI_resampled.csv", index_label=False, index=False)
df_filtered.to_csv("../../data/csv/BMI_filtered.csv", index_label=False, index=False)
df_filtered_out.to_csv("../../data/csv/BMI_filtered_out.csv", index_label=False, index=False)
df_outliers.to_csv("../../data/csv/BMI_outliers.csv", index_label=False, index=False)

## Dump files with pickle using the highest protocol available
output = open('../../data/pkl/BMI_resampled.pkl', 'wb')
pickle.dump(df_resampled, output, -1)
output.close()
output = open('../../data/pkl/BMI_filtered.pkl', 'wb')
pickle.dump(df_filtered, output, -1)
output.close()
output = open('../../data/pkl/BMI_filtered_out.pkl', 'wb')
pickle.dump(df_filtered_out, output, -1)
output.close()
output = open('../../data/pkl/BMI_outliers.pkl', 'wb')
pickle.dump(df_outliers, output, -1)
output.close()
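
## To load one of these pickles back later, something like the following should work:
#with open('../../data/pkl/BMI_resampled.pkl', 'rb') as f:
#    df_resampled = pickle.load(f)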