# datasets/preprocess/tools.py
import math

import numpy as np
import pandas as pd
def calculate_data_existing_length(data):
    """Count the non-missing entries in ``data``.

    Args:
        data: Iterable of scalar values, possibly containing NaN/None.

    Returns:
        int: Number of entries for which ``pd.isna`` is False.
    """
    # Idiomatic replacement for the manual counter loop.
    return sum(1 for value in data if not pd.isna(value))
# elements in data are sorted in time ascending order
def fill_missing_value(data, to_fill_value=0):
    """Impute missing entries of ``data`` in place and return it.

    Leading NaNs (before the first observed value) are set to
    ``to_fill_value``; every later NaN is forward-filled from its
    predecessor. If no value is present at all, the whole sequence is
    set to ``to_fill_value``. ``data`` is mutated in place.

    Args:
        data: Mutable sequence sorted in ascending time order.
        to_fill_value: Value used where no earlier observation exists.

    Returns:
        The same (mutated) sequence.
    """
    length = len(data)
    missing = [pd.isna(value) for value in data]

    # Nothing to impute: return unchanged.
    if not any(missing):
        return data

    # Fully missing: every slot gets the default fill value.
    if all(missing):
        for idx in range(length):
            data[idx] = to_fill_value
        return data

    if missing[0]:
        # Locate the first observed value and back-fill the leading gap
        # with the default (typically a median-derived) value.
        first_valid = next(idx for idx, gap in enumerate(missing) if not gap)
        for idx in range(first_valid):
            data[idx] = to_fill_value

    # Forward-fill everything after the first element.
    for idx in range(1, length):
        if pd.isna(data[idx]):
            data[idx] = data[idx - 1]
    return data
def forward_fill_pipeline(
    df: pd.DataFrame,
    default_fill: pd.DataFrame,
    demographic_features: list[str],
    labtest_features: list[str],
    target_features: list[str],
    require_impute_features: list[str],
):
    """Group ``df`` by patient, impute missing lab values per patient, and
    build per-patient sequences of features, labels, record times, and
    missingness masks.

    Args:
        df: Long-format table with one row per (PatientID, RecordTime).
        default_fill: Per-feature default fill values (median-derived).
        demographic_features: Static features included in each x vector.
        labtest_features: Time-varying features included in each x vector.
        target_features: Label columns collected into y.
        require_impute_features: Columns run through ``fill_missing_value``.

    Returns:
        Tuple (all_x, all_y, all_pid, all_record_times, all_missing_masks),
        each a list with one entry per patient.
    """
    grouped = df.groupby("PatientID")

    all_x = []
    all_y = []
    all_pid = []
    all_record_times = []  # List to store record times for each patient
    all_missing_masks = []

    for name, group in grouped:
        sorted_group = group.sort_values(by=["RecordTime"], ascending=True)
        patient_x = []
        patient_y = []
        patient_record_times = []  # Record times for the current patient
        # Mask is computed BEFORE imputation so it reflects the raw data.
        patient_missing_masks = pd.isna(sorted_group[labtest_features]).values.astype(int).tolist()

        for f in require_impute_features:
            # Features absent from default_fill (normally categorical) fall back to -1.
            if f not in default_fill:
                to_fill_value = -1
            else:
                # Median patient value is the default fill for this feature.
                to_fill_value = default_fill[f]
            # BUGFIX: assign the imputed column back explicitly. The original
            # mutated `sorted_group[f].values` in place and relied on that
            # array being a writable view of the frame — under pandas
            # Copy-on-Write `.values` may be a copy, so the imputation was
            # silently dropped.
            sorted_group[f] = fill_missing_value(sorted_group[f].to_numpy(copy=True), to_fill_value)

        for _, v in sorted_group.iterrows():
            patient_record_times.append(v['RecordTime'])
            # Labels for this time step.
            patient_y.append([v[f] for f in target_features])
            # Feature vector: demographics first, then lab tests.
            patient_x.append([v[f] for f in demographic_features + labtest_features])

        all_x.append(patient_x)
        all_y.append(patient_y)
        all_pid.append(name)
        all_record_times.append(patient_record_times)
        all_missing_masks.append(patient_missing_masks)
    return all_x, all_y, all_pid, all_record_times, all_missing_masks
# outlier processing
def filter_outlier(element):
    """Zero out extreme values: anything with |element| > 1e4 becomes 0."""
    return 0 if np.abs(float(element)) > 1e4 else element
def normalize_dataframe(train_df, val_df, test_df, normalize_features, require_norm_later=True):
    """Compute robust training statistics and (optionally) z-score all splits.

    Statistics are taken only from training values strictly inside the
    5th-95th percentile band. With ``require_norm_later`` True, the train,
    val, and test frames are normalized in place, outliers are zeroed, and
    a 7-tuple (train_df, val_df, test_df, default_fill, los_info,
    train_mean, train_std) is returned. Otherwise only the raw per-feature
    medians are returned as the default fill values.
    """
    # Percentile band used to trim extreme training values.
    low = train_df[normalize_features].quantile(0.05)
    high = train_df[normalize_features].quantile(0.95)

    # Elementwise mask: values outside the band become NaN and are then
    # ignored by mean/std/median.
    trimmed = train_df[(train_df[normalize_features] > low) & (
        train_df[normalize_features] < high)]

    # A feature whose statistic is all-NaN falls back to 0 (and will be
    # filled with 0 downstream).
    train_mean = trimmed[normalize_features].mean().fillna(0)
    train_std = trimmed[normalize_features].std().fillna(0)
    train_median = trimmed[normalize_features].median().fillna(0)

    if not require_norm_later:
        # Raw medians serve as fill values when no z-scoring is wanted.
        default_fill: pd.DataFrame = train_median
        return default_fill

    # Default fill value expressed in z-score space.
    default_fill: pd.DataFrame = (train_median-train_mean)/(train_std+1e-12)

    # LOS info (raw scale).
    los_info = {"los_mean": train_mean["LOS"].item(
    ), "los_std": train_std["LOS"].item(), "los_median": train_median["LOS"].item()}

    # Large LOS cap and threshold (optional, designed for covid-19 benchmark).
    los_array = train_df.groupby('PatientID')['LOS'].max().values
    los_p95 = np.percentile(los_array, 95)
    los_p5 = np.percentile(los_array, 5)
    filtered_los = los_array[(los_array >= los_p5) & (los_array <= los_p95)]
    los_info.update({"large_los": los_p95.item(), "threshold": filtered_los.mean().item()*0.5})

    # Z-score each split with the training statistics, then zero out
    # extreme normalized values.
    denom = train_std + 1e-12
    for split in (train_df, val_df, test_df):
        split.loc[:, normalize_features] = (split.loc[:, normalize_features] - train_mean) / denom
        split.loc[:, normalize_features] = split.loc[:, normalize_features].map(filter_outlier)

    return train_df, val_df, test_df, default_fill, los_info, train_mean, train_std
def normalize_df_with_statistics(df, normalize_features, train_mean, train_std):
    """Z-score ``normalize_features`` of ``df`` in place using precomputed
    training statistics, then zero out extreme normalized values."""
    scaled = (df.loc[:, normalize_features] - train_mean) / (train_std + 1e-12)
    df.loc[:, normalize_features] = scaled
    df.loc[:, normalize_features] = df.loc[:, normalize_features].map(filter_outlier)
    return df