# datasets/preprocess/tools.py
import math
import numpy as np
import pandas as pd
def calculate_data_existing_length(data):
    """Count the non-missing entries in ``data``.

    Args:
        data: iterable of scalar values, possibly containing NaN/None.

    Returns:
        int: number of entries that are present (not NaN/None).
    """
    # pd.isna treats both float('nan') and None as missing.
    return sum(1 for value in data if not pd.isna(value))
# elements in data are sorted in time ascending order
def fill_missing_value(data, to_fill_value=0):
    """Impute missing values in-place, assuming time-ascending order.

    Leading NaNs (before the first observed value) are replaced with
    ``to_fill_value``; every later NaN is forward-filled from its
    predecessor. If the sequence is entirely missing, every slot gets
    ``to_fill_value``.

    Args:
        data: mutable sequence (list or 1-D ndarray), sorted by time.
        to_fill_value: fallback used when no earlier observation exists
            (callers typically pass a per-feature median-derived value).

    Returns:
        The same ``data`` object, mutated so no NaNs remain.
    """
    data_len = len(data)
    data_exist_len = sum(1 for v in data if not pd.isna(v))
    if data_len == data_exist_len:
        # Nothing missing; return unchanged.
        return data
    if data_exist_len == 0:
        # Entirely missing: use the fallback everywhere.
        for i in range(data_len):
            data[i] = to_fill_value
        return data
    if pd.isna(data[0]):
        # Locate the first observed value...
        not_na_pos = 0
        for i in range(data_len):
            if not pd.isna(data[i]):
                not_na_pos = i
                break
        # ...and fill everything before it with the fallback value
        # (no earlier observation is available to carry forward).
        for i in range(not_na_pos):
            data[i] = to_fill_value
    # Forward-fill every remaining gap from the previous entry.
    for i in range(1, data_len):
        if pd.isna(data[i]):
            data[i] = data[i - 1]
    return data
def forward_fill_pipeline(
    df: pd.DataFrame,
    default_fill: pd.DataFrame,
    demographic_features: list[str],
    labtest_features: list[str],
    target_features: list[str],
    require_impute_features: list[str],
):
    """Group records by patient, impute lab values, and emit model-ready lists.

    For each patient (grouped on the ``PatientID`` column, sorted by
    ``RecordTime`` ascending), builds per-visit feature vectors
    (demographics followed by labtests), per-visit target vectors, the
    visit timestamps, and a 0/1 missingness mask over the labtest
    features (1 = value was missing before imputation).

    Args:
        df: long-format records; must contain ``PatientID`` and
            ``RecordTime`` columns plus all feature/target columns.
        default_fill: per-feature default fill values (indexable by
            feature name via ``in`` / ``[]``).
        demographic_features: static feature column names.
        labtest_features: time-varying feature column names.
        target_features: target column names.
        require_impute_features: columns to run ``fill_missing_value`` on.

    Returns:
        Tuple ``(all_x, all_y, all_pid, all_record_times,
        all_missing_masks)``, each a list with one entry per patient.
    """
    grouped = df.groupby("PatientID")
    all_x = []
    all_y = []
    all_pid = []
    all_record_times = []  # record times, one list per patient
    all_missing_masks = []
    # Hoist the per-visit feature ordering out of the patient loop.
    x_features = demographic_features + labtest_features
    for name, group in grouped:
        sorted_group = group.sort_values(by=["RecordTime"], ascending=True)
        # Mask is computed BEFORE imputation so it reflects true missingness.
        patient_missing_masks = pd.isna(sorted_group[labtest_features]).values.astype(int).tolist()
        for f in require_impute_features:
            # Features absent from default_fill (normally categorical) fall back to -1.
            if f not in default_fill:
                to_fill_value = -1
            else:
                to_fill_value = default_fill[f]
            # BUG FIX: the original mutated the array returned by
            # ``.values`` and never wrote it back; that write-through is
            # not guaranteed (e.g. under pandas copy-on-write), so the
            # imputed values could be silently dropped. Impute a copy
            # and assign the column back explicitly.
            sorted_group[f] = fill_missing_value(
                sorted_group[f].to_numpy(copy=True), to_fill_value
            )
        patient_x = []
        patient_y = []
        patient_record_times = []
        for _, v in sorted_group.iterrows():
            patient_record_times.append(v['RecordTime'])
            patient_y.append([v[f] for f in target_features])
            patient_x.append([v[f] for f in x_features])
        all_x.append(patient_x)
        all_y.append(patient_y)
        all_pid.append(name)
        all_record_times.append(patient_record_times)
        all_missing_masks.append(patient_missing_masks)
    return all_x, all_y, all_pid, all_record_times, all_missing_masks
# outlier processing
def filter_outlier(element):
    """Zero out a value whose magnitude is implausibly large.

    After z-scoring, any |value| above 1e4 is treated as a data error
    and replaced with 0; everything else passes through unchanged.
    """
    return 0 if np.abs(float(element)) > 1e4 else element
def normalize_dataframe(train_df, val_df, test_df, normalize_features, require_norm_later=True):
    """Compute train-split statistics and (optionally) z-score all three splits.

    Statistics (mean/std/median) are computed on the train split only, after
    masking values outside the 5th-95th percentile band. When
    ``require_norm_later`` is True, the three frames are normalized IN PLACE
    and the function returns
    ``(train_df, val_df, test_df, default_fill, los_info, train_mean, train_std)``;
    when False, only the per-feature medians are returned as ``default_fill``.
    The True branch requires ``LOS`` and ``PatientID`` columns.
    """
    # Calculate the quantiles
    q_low = train_df[normalize_features].quantile(0.05)
    q_high = train_df[normalize_features].quantile(0.95)
    # Filter the DataFrame based on the quantiles.
    # NOTE: indexing with a boolean DataFrame keeps the frame's shape and
    # replaces out-of-band cells (and all non-normalize columns) with NaN,
    # so the stats below are per-cell winsorized, not row-filtered.
    filtered_df = train_df[(train_df[normalize_features] > q_low) & (
        train_df[normalize_features] < q_high)]
    # Calculate the mean and standard deviation and median of the filtered data, also the default fill value
    train_mean = filtered_df[normalize_features].mean()
    train_std = filtered_df[normalize_features].std()
    train_median = filtered_df[normalize_features].median()
    # if certain feature's mean/std/median is NaN, then set it as 0. This feature will be filled with 0 in the following steps
    train_mean = train_mean.fillna(0)
    train_std = train_std.fillna(0)
    train_median = train_median.fillna(0)
    if require_norm_later:
        # Default fill value expressed in z-scored units (median after normalization).
        default_fill: pd.DataFrame = (train_median-train_mean)/(train_std+1e-12)
        # LOS info
        los_info = {"los_mean": train_mean["LOS"].item(
        ), "los_std": train_std["LOS"].item(), "los_median": train_median["LOS"].item()}
        # Calculate large los and threshold (optional, designed for covid-19 benchmark)
        los_array = train_df.groupby('PatientID')['LOS'].max().values
        los_p95 = np.percentile(los_array, 95)
        los_p5 = np.percentile(los_array, 5)
        filtered_los = los_array[(los_array >= los_p5) & (los_array <= los_p95)]
        los_info.update({"large_los": los_p95.item(), "threshold": filtered_los.mean().item()*0.5})
        # Z-score normalize the train, val, and test sets with train_mean and train_std
        # (epsilon guards against zero std; all three frames are mutated in place).
        train_df.loc[:, normalize_features] = (train_df.loc[:, normalize_features] - train_mean) / (train_std+1e-12)
        val_df.loc[:, normalize_features] = (val_df.loc[:, normalize_features] - train_mean) / (train_std+1e-12)
        test_df.loc[:, normalize_features] = (test_df.loc[:, normalize_features] - train_mean) / (train_std+1e-12)
        # Clamp implausibly large post-normalization magnitudes to 0.
        # NOTE(review): DataFrame.map is pandas >= 2.1 (formerly applymap) — confirm pinned version.
        train_df.loc[:, normalize_features] = train_df.loc[:, normalize_features].map(filter_outlier)
        val_df.loc[:, normalize_features] = val_df.loc[:, normalize_features].map(filter_outlier)
        test_df.loc[:, normalize_features] = test_df.loc[:, normalize_features].map(filter_outlier)
        return train_df, val_df, test_df, default_fill, los_info, train_mean, train_std
    else:
        # No normalization requested: the raw medians serve as fill values.
        default_fill: pd.DataFrame = train_median
        return default_fill
def normalize_df_with_statistics(df, normalize_features, train_mean, train_std):
df.loc[:, normalize_features] = (df.loc[:, normalize_features] - train_mean) / (train_std+1e-12)
df.loc[:, normalize_features] = df.loc[:, normalize_features].map(filter_outlier)
return df