"""Preprocessing helpers for per-patient record tables.

Contains column reordering, missing-value filling (default value + forward
fill), and z-score normalization based on training-set statistics.
"""
import pandas as pd


def df_column_switch(df: pd.DataFrame, column1, column2):
    """Return a view/copy of ``df`` with the positions of two columns swapped.

    Args:
        df: Source frame; it is not modified.
        column1: Label of the first column.
        column2: Label of the second column.

    Returns:
        ``df`` re-indexed so that ``column1`` and ``column2`` trade places.

    Raises:
        ValueError: If either label is not among ``df.columns``.
    """
    order = list(df.columns)
    pos1, pos2 = order.index(column1), order.index(column2)
    order[pos1], order[pos2] = order[pos2], order[pos1]
    return df[order]


def calculate_data_existing_length(data):
    """Count the elements of ``data`` that are not missing per ``pd.isna``."""
    return sum(1 for value in data if not pd.isna(value))


# elements in data are sorted in time ascending order
def fill_missing_value(data, to_fill_value=0):
    """Fill missing entries of a time-ordered sequence in place.

    Missing entries that precede the first observed value are replaced with
    ``to_fill_value``; every later missing entry takes the closest preceding
    value (forward fill). If nothing is observed at all, every entry becomes
    ``to_fill_value``.

    Args:
        data: Mutable, integer-indexable sequence (e.g. ``list`` or NumPy
            array) ordered by ascending time. It is modified in place.
        to_fill_value: Value used before the first observed entry.

    Returns:
        The same ``data`` object, after filling.
    """
    # Carrying the last seen value covers all cases in a single pass: it
    # starts as the default and is overwritten by each observed entry.
    last_value = to_fill_value
    for idx, value in enumerate(data):
        if pd.isna(value):
            data[idx] = last_value
        else:
            last_value = value
    return data


def forward_fill_pipeline(
    df: pd.DataFrame,
    default_fill: pd.Series,
    demographic_features: list[str],
    labtest_features: list[str],
):
    """Build per-patient feature/label sequences with missing values filled.

    Records are grouped by ``"PatientID"`` and ordered by ``"RecordTime"``.
    Within each patient, ``"Age"`` and every lab-test feature are filled with
    ``fill_missing_value`` using ``default_fill[feature]`` as the default.
    Note that ``"Age"`` is the only demographic column that gets filled.

    Args:
        df: Table containing ``"PatientID"``, ``"RecordTime"``, ``"Outcome"``,
            ``"LOS"``, ``"Age"`` and all listed feature columns. Not modified.
        default_fill: Mapping (e.g. ``pd.Series``) from feature name to the
            value used before a patient's first observation of that feature.
            Presumably the ``default_fill`` produced by
            ``normalize_dataframe`` -- confirm against the caller.
        demographic_features: Demographic column names, emitted first in
            each feature row.
        labtest_features: Lab-test column names, emitted after the
            demographic ones.

    Returns:
        ``(all_x, all_y, all_pid)`` where ``all_x[i]`` is the list of feature
        rows of patient ``i``, ``all_y[i]`` the matching list of
        ``[Outcome, LOS]`` pairs, and ``all_pid[i]`` the patient identifier.
    """
    feature_columns = demographic_features + labtest_features
    columns_to_fill = ["Age"] + labtest_features

    all_x = []
    all_y = []
    all_pid = []

    for patient_id, records in df.groupby("PatientID"):
        # sort_values returns a new frame, so the caller's df stays untouched.
        ordered = records.sort_values(by=["RecordTime"], ascending=True)

        for feature in columns_to_fill:
            # Fill an explicit copy and assign it back. Writing through
            # ``.values`` is unreliable: the array may be a copy (fills are
            # lost) or read-only under pandas Copy-on-Write (ValueError).
            column = ordered[feature].to_numpy(copy=True)
            ordered[feature] = fill_missing_value(column, default_fill[feature])

        all_x.append(ordered[feature_columns].to_numpy().tolist())
        all_y.append(ordered[["Outcome", "LOS"]].to_numpy().tolist())
        all_pid.append(patient_id)

    return all_x, all_y, all_pid


def normalize_dataframe(train_df, val_df, test_df, normalize_features):
    """Z-score normalize three splits using trimmed training statistics.

    Mean, standard deviation and median are computed per feature from the
    training values lying strictly between the 5% and 95% training
    quantiles. All three frames are then normalized **in place** with the
    training mean and standard deviation.

    Args:
        train_df: Training split; source of all statistics. Modified in place.
        val_df: Validation split. Modified in place.
        test_df: Test split. Modified in place.
        normalize_features: Column names to normalize; must include ``"LOS"``.

    Returns:
        ``(train_df, val_df, test_df, default_fill, LOS_info)`` where
        ``default_fill`` is a ``pd.Series`` holding the normalized training
        median of each feature and ``LOS_info`` is a dict with the
        un-normalized ``"mean"``, ``"std"`` and ``"median"`` of ``"LOS"``.

    Raises:
        KeyError: If ``"LOS"`` is not in ``normalize_features``.
    """
    train_features = train_df[normalize_features]

    # Trim outliers: keep only values strictly inside the 5%-95% band.
    q_low = train_features.quantile(0.05)
    q_high = train_features.quantile(0.95)
    inside_band = (train_features > q_low) & (train_features < q_high)
    trimmed = train_features.where(inside_band)

    train_mean = trimmed.mean()
    train_std = trimmed.std()
    train_median = trimmed.median()

    # The epsilon guards against division by zero for constant features.
    scale = train_std + 1e-12
    default_fill: pd.Series = (train_median - train_mean) / scale

    # LOS statistics in original units, e.g. for de-normalizing predictions.
    LOS_info = {
        "mean": train_mean["LOS"],
        "std": train_std["LOS"],
        "median": train_median["LOS"],
    }

    for split in (train_df, val_df, test_df):
        split[normalize_features] = (split[normalize_features] - train_mean) / scale

    return train_df, val_df, test_df, default_fill, LOS_info