"""Preprocessing utilities: missing-value imputation, per-patient forward fill,
and train-statistics-based normalization."""
import numpy as np
import pandas as pd


def calculate_data_existing_length(data):
    """Count the non-NaN entries in a sequence."""
    res = 0
    for i in data:
        if not pd.isna(i):
            res += 1
    return res
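
# Example (illustrative values): two of the three entries are non-NaN.
# >>> calculate_data_existing_length([1.0, float("nan"), 3.0])
# 2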


# elements in data are sorted in time-ascending order
def fill_missing_value(data, to_fill_value=0):
    data_len = len(data)
    data_exist_len = calculate_data_existing_length(data)
    if data_len == data_exist_len:
        # nothing missing
        return data
    elif data_exist_len == 0:
        # everything missing: fill the whole sequence with the default value
        for i in range(data_len):
            data[i] = to_fill_value
        return data
    if pd.isna(data[0]):
        # find the position of the first non-NaN value
        not_na_pos = 0
        for i in range(data_len):
            if not pd.isna(data[i]):
                not_na_pos = i
                break
        # fill elements before the first non-NaN value with the default
        # fill value (the z-scored train median in this pipeline)
        for i in range(not_na_pos):
            data[i] = to_fill_value
    # forward-fill elements after the first non-NaN value
    for i in range(1, data_len):
        if pd.isna(data[i]):
            data[i] = data[i - 1]
    return data
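
# Example (illustrative values): leading NaNs take the default fill value,
# later NaNs copy the previous entry (forward fill).
# >>> fill_missing_value([float("nan"), 7.0, float("nan"), 9.0], to_fill_value=5.0)
# [5.0, 7.0, 7.0, 9.0]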


def forward_fill_pipeline(
    df: pd.DataFrame,
    default_fill: pd.Series,
    demographic_features: list[str],
    labtest_features: list[str],
):
    grouped = df.groupby("PatientID")

    all_x = []
    all_y = []
    all_pid = []
    for name, group in grouped:
        sorted_group = group.sort_values(by=["RecordTime"], ascending=True)
        patient_x = []
        patient_y = []
        for f in ["Age"] + labtest_features:
            # take the (z-scored) train median as the default to-fill value;
            # assign the filled array back instead of mutating `.values` in
            # place, which is not guaranteed to propagate (and fails under
            # pandas copy-on-write)
            to_fill_value = default_fill[f]
            sorted_group[f] = fill_missing_value(
                sorted_group[f].to_numpy(copy=True), to_fill_value)
        for _, v in sorted_group.iterrows():
            patient_y.append([v["Outcome"], v["LOS"]])
            x = []
            for f in demographic_features + labtest_features:
                x.append(v[f])
            patient_x.append(x)
        all_x.append(patient_x)
        all_y.append(patient_y)
        all_pid.append(name)
    return all_x, all_y, all_pid
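
# Usage sketch (hypothetical data; "WBC" is an invented lab feature, while the
# "PatientID"/"RecordTime"/"Outcome"/"LOS"/"Age" columns follow the conventions
# forward_fill_pipeline assumes):
# >>> demo = pd.DataFrame({
# ...     "PatientID": [1, 1], "RecordTime": [0, 1], "Outcome": [0, 0],
# ...     "LOS": [5.0, 4.0], "Age": [60.0, 60.0], "WBC": [np.nan, 7.2]})
# >>> x, y, pid = forward_fill_pipeline(
# ...     demo, pd.Series({"Age": 0.0, "WBC": 0.0}), ["Age"], ["WBC"])
# >>> pid
# [1]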


# outlier processing: clamp NaNs and extreme values to 0
def filter_outlier(element):
    if pd.isna(element):
        return 0
    elif np.abs(float(element)) > 1e4:
        return 0
    else:
        return element
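
# Example: NaNs and |z| > 1e4 are clamped to 0; everything else passes through.
# >>> filter_outlier(float("nan")), filter_outlier(2e4), filter_outlier(1.5)
# (0, 0, 1.5)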


def normalize_dataframe(train_df, val_df, test_df, normalize_features):
    # Calculate the 5% / 95% quantiles on the train set
    q_low = train_df[normalize_features].quantile(0.05)
    q_high = train_df[normalize_features].quantile(0.95)

    # Mask train values outside the quantile range; they become NaN and are
    # ignored by the statistics below
    filtered_df = train_df[(train_df[normalize_features] > q_low) &
                           (train_df[normalize_features] < q_high)]

    # Mean, standard deviation, and median of the filtered train data; the
    # default fill value is the z-scored train median
    train_mean = filtered_df[normalize_features].mean()
    train_std = filtered_df[normalize_features].std()
    train_median = filtered_df[normalize_features].median()
    default_fill: pd.Series = (train_median - train_mean) / (train_std + 1e-12)

    # LOS statistics, kept so predictions can be inverse-transformed later
    los_info = {
        "los_mean": train_mean["LOS"].item(),
        "los_std": train_std["LOS"].item(),
        "los_median": train_median["LOS"].item(),
    }

    # Calculate large LOS and threshold (optional, designed for the covid-19 benchmark)
    los_array = train_df.groupby("PatientID")["LOS"].max().values
    los_p95 = np.percentile(los_array, 95)
    los_p5 = np.percentile(los_array, 5)
    filtered_los = los_array[(los_array >= los_p5) & (los_array <= los_p95)]
    los_info.update({
        "large_los": los_p95.item(),
        "threshold": filtered_los.mean().item() * 0.5,
    })

    # Z-score normalize the train, val, and test sets with train_mean and train_std
    train_df[normalize_features] = (train_df[normalize_features] - train_mean) / (train_std + 1e-12)
    val_df[normalize_features] = (val_df[normalize_features] - train_mean) / (train_std + 1e-12)
    test_df[normalize_features] = (test_df[normalize_features] - train_mean) / (train_std + 1e-12)

    # Clamp NaNs and extreme values produced by the normalization
    # (note: DataFrame.applymap is deprecated since pandas 2.1 in favor of DataFrame.map)
    train_df.loc[:, normalize_features] = train_df.loc[:, normalize_features].applymap(filter_outlier)
    val_df.loc[:, normalize_features] = val_df.loc[:, normalize_features].applymap(filter_outlier)
    test_df.loc[:, normalize_features] = test_df.loc[:, normalize_features].applymap(filter_outlier)

    return train_df, val_df, test_df, default_fill, los_info, train_mean, train_std
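
# Usage sketch (assumes the frames are already split by patient; the feature
# names are illustrative): fit statistics on train only, then reuse default_fill
# as the to-fill value in forward_fill_pipeline.
# >>> train_df, val_df, test_df, default_fill, los_info, m, s = normalize_dataframe(
# ...     train_df, val_df, test_df, normalize_features=["Age", "LOS", "WBC"])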


def normalize_df_with_statistics(df, normalize_features, train_mean, train_std):
    """Normalize a dataframe with previously computed train statistics."""
    df[normalize_features] = (df[normalize_features] - train_mean) / (train_std + 1e-12)
    df.loc[:, normalize_features] = df.loc[:, normalize_features].applymap(filter_outlier)
    return df
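
# Usage sketch (hypothetical external cohort): reuse the saved train statistics
# so new data lands in the same z-score space as the training set.
# >>> external_df = normalize_df_with_statistics(
# ...     external_df, ["Age", "LOS", "WBC"], m, s)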