|
a |
|
b/datasets/preprocess/tools.py |
|
|
1 |
import math |
|
|
2 |
|
|
|
3 |
import numpy as np |
|
|
4 |
import pandas as pd |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
def calculate_data_existing_length(data):
    """Count the non-missing (non-NaN) entries in a 1-D sequence.

    Args:
        data: iterable of scalar values, possibly containing NaN/None.

    Returns:
        int: number of entries for which ``pd.isna`` is False.
    """
    # pd.isna handles None, float('nan'), and pandas' NA/NaT uniformly.
    return sum(1 for value in data if not pd.isna(value))
|
|
13 |
|
|
|
14 |
|
|
|
15 |
# elements in data are sorted in time ascending order
def fill_missing_value(data, to_fill_value=0):
    """Impute missing values in-place in one time-ordered feature sequence.

    Strategy:
      * no value missing        -> return unchanged
      * every value missing     -> fill everything with ``to_fill_value``
      * leading values missing  -> fill the leading gap with ``to_fill_value``
      * any later gap           -> forward-fill from the previous observation

    Args:
        data: mutable, index-addressable sequence (list or numpy array),
            sorted by record time ascending.
        to_fill_value: value used where no earlier observation exists
            (callers normally pass the training-set median of the feature).

    Returns:
        The same ``data`` object, mutated in place.
    """
    data_len = len(data)
    data_exist_len = calculate_data_existing_length(data)
    if data_len == data_exist_len:
        # nothing missing: no work to do
        return data
    elif data_exist_len == 0:
        # no observation at all: use the default fill value everywhere
        for i in range(data_len):
            data[i] = to_fill_value
        return data
    if pd.isna(data[0]):
        # locate the first observed value (guaranteed to exist here,
        # since data_exist_len > 0) ...
        not_na_pos = next(i for i in range(data_len) if not pd.isna(data[i]))
        # ... and fill everything before it with the default fill value
        for i in range(not_na_pos):
            data[i] = to_fill_value
    # forward-fill every remaining gap from its predecessor
    for i in range(1, data_len):
        if pd.isna(data[i]):
            data[i] = data[i - 1]
    return data
|
|
41 |
|
|
|
42 |
|
|
|
43 |
def forward_fill_pipeline(
    df: pd.DataFrame,
    default_fill: pd.DataFrame,
    demographic_features: list[str],
    labtest_features: list[str],
    target_features: list[str],
    require_impute_features: list[str],
):
    """Build per-patient sequences with forward-filled lab features.

    Groups ``df`` by ``PatientID``, sorts each patient's visits by
    ``RecordTime``, imputes missing lab values (leading gap -> default
    fill value, later gaps -> previous observation), and collects
    nested python lists suitable for model input.

    Args:
        df: long-format frame; must contain ``PatientID`` and
            ``RecordTime`` columns plus all feature columns.
        default_fill: per-feature default fill values (indexed by
            feature name); features absent here are filled with -1.
        demographic_features: static per-patient feature names.
        labtest_features: time-varying feature names.
        target_features: prediction target column names.
        require_impute_features: feature names to run imputation on.

    Returns:
        Tuple of five parallel per-patient lists:
        (all_x, all_y, all_pid, all_record_times, all_missing_masks).
    """
    grouped = df.groupby("PatientID")

    all_x = []
    all_y = []
    all_pid = []
    all_record_times = []  # List to store record times for each patient
    all_missing_masks = []

    for name, group in grouped:
        sorted_group = group.sort_values(by=["RecordTime"], ascending=True)
        patient_x = []
        patient_y = []
        patient_record_times = []  # List to store record times for the current patient
        # mask is taken BEFORE imputation: 1 marks an originally-missing value
        patient_missing_masks = pd.isna(sorted_group[labtest_features]).values.astype(int).tolist()

        for f in require_impute_features:
            # if the f is not in the default_fill, then default to -1
            if f not in default_fill:  # these are normally categorical features
                to_fill_value = -1
            else:
                # take median patient as the default to-fill missing value
                to_fill_value = default_fill[f]
            # BUGFIX: assign the imputed column back to the frame. The array
            # returned by `.values` is not guaranteed to be a writable view of
            # the frame, so mutating it in place could silently do nothing
            # (it is always a no-op under pandas copy-on-write).
            sorted_group[f] = fill_missing_value(sorted_group[f].values, to_fill_value)

        for _, v in sorted_group.iterrows():
            patient_record_times.append(v['RecordTime'])
            # targets and inputs are collected in declaration order
            patient_y.append([v[f] for f in target_features])
            patient_x.append([v[f] for f in demographic_features + labtest_features])

        all_x.append(patient_x)
        all_y.append(patient_y)
        all_pid.append(name)
        all_record_times.append(patient_record_times)
        all_missing_masks.append(patient_missing_masks)
    return all_x, all_y, all_pid, all_record_times, all_missing_masks
|
|
93 |
|
|
|
94 |
|
|
|
95 |
# outlier processing
def filter_outlier(element):
    """Return 0 for any value whose magnitude exceeds 1e4, else the value itself."""
    return 0 if abs(float(element)) > 1e4 else element
|
|
101 |
|
|
|
102 |
def normalize_dataframe(train_df, val_df, test_df, normalize_features, require_norm_later=True):
    """Compute robust (5%-95% trimmed) train statistics and optionally z-score all splits.

    NOTE: mutates ``train_df``/``val_df``/``test_df`` in place when
    ``require_norm_later`` is True, and the return arity depends on that flag:
      * True  -> (train_df, val_df, test_df, default_fill, los_info, train_mean, train_std)
      * False -> default_fill only (the per-feature train median)
    Requires an "LOS" column in ``normalize_features`` and a "PatientID"
    column in ``train_df`` when ``require_norm_later`` is True.
    """
    # Calculate the quantiles
    q_low = train_df[normalize_features].quantile(0.05)
    q_high = train_df[normalize_features].quantile(0.95)

    # Filter the DataFrame based on the quantiles.
    # Indexing with a DataFrame-shaped boolean mask keeps the frame's shape and
    # replaces out-of-range cells with NaN, which the mean/std/median below skip.
    filtered_df = train_df[(train_df[normalize_features] > q_low) & (
        train_df[normalize_features] < q_high)]

    # Calculate the mean and standard deviation and median of the filtered data, also the default fill value
    train_mean = filtered_df[normalize_features].mean()
    train_std = filtered_df[normalize_features].std()
    train_median = filtered_df[normalize_features].median()

    # if certain feature's mean/std/median is NaN, then set it as 0. This feature will be filled with 0 in the following steps
    train_mean = train_mean.fillna(0)
    train_std = train_std.fillna(0)
    train_median = train_median.fillna(0)

    if require_norm_later:
        # default fill value expressed in z-score space (median after normalization);
        # 1e-12 guards against division by zero for constant features
        default_fill: pd.DataFrame = (train_median-train_mean)/(train_std+1e-12)
        # LOS info
        los_info = {"los_mean": train_mean["LOS"].item(
        ), "los_std": train_std["LOS"].item(), "los_median": train_median["LOS"].item()}

        # Calculate large los and threshold (optional, designed for covid-19 benchmark)
        # one LOS value per patient: the maximum over that patient's records
        los_array = train_df.groupby('PatientID')['LOS'].max().values
        los_p95 = np.percentile(los_array, 95)
        los_p5 = np.percentile(los_array, 5)
        filtered_los = los_array[(los_array >= los_p5) & (los_array <= los_p95)]
        los_info.update({"large_los": los_p95.item(), "threshold": filtered_los.mean().item()*0.5})

        # Z-score normalize the train, val, and test sets with train_mean and train_std
        train_df.loc[:, normalize_features] = (train_df.loc[:, normalize_features] - train_mean) / (train_std+1e-12)
        val_df.loc[:, normalize_features] = (val_df.loc[:, normalize_features] - train_mean) / (train_std+1e-12)
        test_df.loc[:, normalize_features] = (test_df.loc[:, normalize_features] - train_mean) / (train_std+1e-12)

        # zero out extreme post-normalization values elementwise
        # (DataFrame.map requires pandas >= 2.1)
        train_df.loc[:, normalize_features] = train_df.loc[:, normalize_features].map(filter_outlier)
        val_df.loc[:, normalize_features] = val_df.loc[:, normalize_features].map(filter_outlier)
        test_df.loc[:, normalize_features] = test_df.loc[:, normalize_features].map(filter_outlier)

        return train_df, val_df, test_df, default_fill, los_info, train_mean, train_std

    else:
        # raw-value mode: callers fill with the train median directly
        default_fill: pd.DataFrame = train_median
        return default_fill
|
|
149 |
|
|
|
150 |
def normalize_df_with_statistics(df, normalize_features, train_mean, train_std):
    """Z-score ``df[normalize_features]`` with precomputed training statistics.

    Mutates ``df`` in place: normalizes the listed columns (1e-12 guards
    against zero std), then zeroes out extreme values via ``filter_outlier``.
    Returns the same ``df`` object.
    """
    zscored = (df.loc[:, normalize_features] - train_mean) / (train_std + 1e-12)
    df.loc[:, normalize_features] = zscored
    cleaned = df.loc[:, normalize_features].map(filter_outlier)
    df.loc[:, normalize_features] = cleaned
    return df
|
|
154 |
|