[d6904d]: /datasets/utils/tools.py

import pandas as pd


def df_column_switch(df: pd.DataFrame, column1, column2):
    # Swap the positions of column1 and column2 in the DataFrame (values are unchanged).
    i = list(df.columns)
    a, b = i.index(column1), i.index(column2)
    i[b], i[a] = i[a], i[b]
    df = df[i]
    return df

def calculate_data_existing_length(data):
    # Count the non-NaN entries in a sequence.
    res = 0
    for i in data:
        if not pd.isna(i):
            res += 1
    return res

# elements in data are sorted in time-ascending order
def fill_missing_value(data, to_fill_value=0):
    data_len = len(data)
    data_exist_len = calculate_data_existing_length(data)
    if data_len == data_exist_len:
        # nothing is missing
        return data
    elif data_exist_len == 0:
        # everything is missing: fill the whole sequence with to_fill_value
        for i in range(data_len):
            data[i] = to_fill_value
        return data
    if pd.isna(data[0]):
        # find the first non-NaN value's position
        not_na_pos = 0
        for i in range(data_len):
            if not pd.isna(data[i]):
                not_na_pos = i
                break
        # fill elements before the first non-NaN value with to_fill_value
        # (the pipeline passes the normalized train-set median here)
        for i in range(not_na_pos):
            data[i] = to_fill_value
    # forward-fill every remaining NaN with the most recent observed value
    for i in range(1, data_len):
        if pd.isna(data[i]):
            data[i] = data[i - 1]
    return data

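# --- Illustration (not part of the original file) ---------------------------
# Expected behaviour of fill_missing_value on a list with leading NaNs, with
# to_fill_value standing in for the normalized median used by the pipeline:
#   >>> import numpy as np
#   >>> fill_missing_value([np.nan, np.nan, 3.0, np.nan, 5.0], to_fill_value=0.0)
#   [0.0, 0.0, 3.0, 3.0, 5.0]
# Leading NaNs become to_fill_value; every later NaN is forward-filled from the
# most recent observed value.
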
def forward_fill_pipeline(
    df: pd.DataFrame,
    default_fill: pd.Series,
    demographic_features: list[str],
    labtest_features: list[str],
):
    grouped = df.groupby("PatientID")

    all_x = []
    all_y = []
    all_pid = []

    for name, group in grouped:
        # process each patient's records in time-ascending order
        sorted_group = group.sort_values(by=["RecordTime"], ascending=True)
        patient_x = []
        patient_y = []

        for f in ["Age"] + labtest_features:
            to_fill_value = default_fill[f]
            # impute this feature in place via the column's .values array, using the
            # normalized train-set median as the default fill value for leading gaps
            fill_missing_value(sorted_group[f].values, to_fill_value)

        for _, v in sorted_group.iterrows():
            patient_y.append([v["Outcome"], v["LOS"]])
            x = []
            for f in demographic_features + labtest_features:
                x.append(v[f])
            patient_x.append(x)

        all_x.append(patient_x)
        all_y.append(patient_y)
        all_pid.append(name)

    return all_x, all_y, all_pid

def normalize_dataframe(train_df, val_df, test_df, normalize_features):
    # Calculate the 5th and 95th percentiles of each feature on the train set
    q_low = train_df[normalize_features].quantile(0.05)
    q_high = train_df[normalize_features].quantile(0.95)

    # Mask out values outside the quantile range; they become NaN and are
    # ignored by the statistics below
    filtered_df = train_df[(train_df[normalize_features] > q_low) & (train_df[normalize_features] < q_high)]

    # Calculate the mean, standard deviation, and median of the filtered data,
    # plus the default fill value (the median expressed in z-score space)
    train_mean = filtered_df[normalize_features].mean()
    train_std = filtered_df[normalize_features].std()
    train_median = filtered_df[normalize_features].median()
    default_fill: pd.Series = (train_median - train_mean) / (train_std + 1e-12)

    # LOS statistics (mean/std/median) from the filtered train set
    LOS_info = {"mean": train_mean["LOS"], "std": train_std["LOS"], "median": train_median["LOS"]}

    # Z-score normalize the train, val, and test sets with train_mean and train_std
    train_df[normalize_features] = (train_df[normalize_features] - train_mean) / (train_std + 1e-12)
    val_df[normalize_features] = (val_df[normalize_features] - train_mean) / (train_std + 1e-12)
    test_df[normalize_features] = (test_df[normalize_features] - train_mean) / (train_std + 1e-12)

    return train_df, val_df, test_df, default_fill, LOS_info
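
A minimal usage sketch of how these helpers fit together, assuming pre-split train/val/test CSVs that follow the column conventions the functions above expect (PatientID, RecordTime, Outcome, LOS, Age, plus lab-test columns) and that the repository root is on PYTHONPATH; the file paths and feature lists below are illustrative, not part of the repository.

import pandas as pd

from datasets.utils.tools import forward_fill_pipeline, normalize_dataframe

# Illustrative splits and feature lists; adjust to the actual dataset.
train_df = pd.read_csv("train.csv")
val_df = pd.read_csv("val.csv")
test_df = pd.read_csv("test.csv")

demographic_features = ["Sex", "Age"]      # hypothetical demographic columns
labtest_features = ["WBC", "Lymphocyte"]   # hypothetical lab-test columns
normalize_features = ["Age"] + labtest_features + ["LOS"]

# 1) Z-score normalize with outlier-trimmed train statistics; also get the
#    per-feature default fill values (normalized medians) and LOS statistics.
train_df, val_df, test_df, default_fill, los_info = normalize_dataframe(
    train_df, val_df, test_df, normalize_features
)

# 2) Per patient: sort by RecordTime, impute missing values, and collect
#    feature sequences, [Outcome, LOS] targets, and patient IDs.
train_x, train_y, train_pid = forward_fill_pipeline(
    train_df, default_fill, demographic_features, labtest_features
)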