covid-ehr-benchmarks / Git / Diff of /datasets/utils/tools.py

Models:
philipB/
covid-ehr-benchmarks
Downloads: 1
Diff of /datasets/utils/tools.py [000000] .. [d6904d]
Switch to side-by-side view

--- a
+++ b/datasets/utils/tools.py
@@ -0,0 +1,103 @@
+import pandas as pd
+
+
+def df_column_switch(df: pd.DataFrame, column1, column2):
+    """Return ``df`` with the positions of ``column1`` and ``column2`` exchanged.
+
+    The column labels are copied into a list, the two entries are swapped, and
+    ``df`` is indexed with the reordered list, so every other column keeps its
+    position. ``list.index`` raises ``ValueError`` if either label is not one
+    of the columns.
+    """
+    ordered_labels = list(df.columns)
+    first_pos = ordered_labels.index(column1)
+    second_pos = ordered_labels.index(column2)
+    ordered_labels[first_pos], ordered_labels[second_pos] = (
+        ordered_labels[second_pos],
+        ordered_labels[first_pos],
+    )
+    return df[ordered_labels]
+
+
+def calculate_data_existing_length(data):
+    """Count the entries of ``data`` for which ``pd.isna`` is false."""
+    return sum(1 for value in data if not pd.isna(value))
+
+
+# elements in data are sorted in time ascending order
+def fill_missing_value(data, to_fill_value=0):
+    """Forward-fill missing entries of ``data`` in place and return it.
+
+    Args:
+        data: Mutable sequence addressed by integer position (for example a
+            list or a NumPy array). Entries for which ``pd.isna`` is true are
+            treated as missing.
+        to_fill_value: Value written into missing entries that have no earlier
+            observed entry to copy from (leading gaps, or a sequence that is
+            missing everywhere).
+
+    Returns:
+        The same ``data`` object, after filling. A sequence without missing
+        entries (including an empty one) is returned untouched.
+    """
+    # Carry the most recent observed value forward. Until one has been seen,
+    # the carry is the caller-supplied default, which covers both the
+    # leading-gap and the all-missing cases in a single pass.
+    carry = to_fill_value
+    for position, value in enumerate(data):
+        if pd.isna(value):
+            data[position] = carry
+        else:
+            carry = value
+    return data
+
+
+def forward_fill_pipeline(
+    df: pd.DataFrame,
+    default_fill: pd.DataFrame,
+    demographic_features: list[str],
+    labtest_features: list[str],
+):
+    """Impute each patient's records and collect them as nested lists.
+
+    Rows are grouped by ``"PatientID"`` and each group is sorted by
+    ``"RecordTime"`` in ascending order. Within a group, ``"Age"`` and every
+    lab-test feature are forward-filled; entries that have no earlier
+    observation are set to ``default_fill[feature]``.
+
+    Args:
+        df: Frame holding the columns ``"PatientID"``, ``"RecordTime"``,
+            ``"Outcome"``, ``"LOS"``, ``"Age"`` and all listed features.
+        default_fill: Object indexed by feature name (``default_fill[f]``)
+            that gives the fallback value for that feature.
+        demographic_features: Feature columns emitted first in each record.
+        labtest_features: Feature columns emitted after the demographic ones.
+
+    Returns:
+        A tuple ``(all_x, all_y, all_pid)``. ``all_x[p][t]`` is the list of
+        feature values of patient ``p`` at record ``t``; ``all_y[p][t]`` is
+        ``[Outcome, LOS]`` for the same record; ``all_pid[p]`` is the group key.
+    """
+    fill_features = ["Age"] + labtest_features
+    feature_columns = demographic_features + labtest_features
+
+    all_x = []
+    all_y = []
+    all_pid = []
+
+    for patient_id, group in df.groupby("PatientID"):
+        sorted_group = group.sort_values(by=["RecordTime"], ascending=True)
+
+        # Assign the filled column back explicitly. Writing through the array
+        # returned by ``.values`` only works when that array is a writable
+        # view of the frame, which pandas does not guarantee (it is read-only
+        # under Copy-on-Write).
+        for feature in fill_features:
+            sorted_group[feature] = (
+                sorted_group[feature].ffill().fillna(default_fill[feature])
+            )
+
+        patient_x = []
+        patient_y = []
+        for _, record in sorted_group.iterrows():
+            patient_y.append([record["Outcome"], record["LOS"]])
+            patient_x.append([record[feature] for feature in feature_columns])
+
+        all_x.append(patient_x)
+        all_y.append(patient_y)
+        all_pid.append(patient_id)
+    return all_x, all_y, all_pid
+
+def normalize_dataframe(train_df, val_df, test_df, normalize_features):
+    """Z-score the given features using trimmed training-set statistics.
+
+    For each feature, mean, standard deviation and median are computed from
+    the training values lying strictly between that feature's 5% and 95%
+    quantiles. If the trimming leaves a statistic undefined (for example a
+    constant or binary column, where every value sits on a quantile bound),
+    the statistic of the untrimmed training column is used instead, so the
+    normalized column does not turn into NaN.
+
+    Args:
+        train_df: Training frame; supplies all statistics.
+        val_df: Validation frame, normalized with the training statistics.
+        test_df: Test frame, normalized with the training statistics.
+        normalize_features: Columns to normalize. Must contain ``"LOS"``,
+            because its statistics are returned separately.
+
+    Returns:
+        ``(train_df, val_df, test_df, default_fill, LOS_info)``. The three
+        frames are the input objects, modified in place. ``default_fill`` is
+        the normalized training median of every feature, and ``LOS_info``
+        holds the un-normalized ``"mean"``, ``"std"`` and ``"median"`` of
+        ``"LOS"``.
+
+    Raises:
+        KeyError: If ``"LOS"`` is not in ``normalize_features``.
+    """
+    train_values = train_df[normalize_features]
+
+    # Per-feature trimming bounds.
+    q_low = train_values.quantile(0.05)
+    q_high = train_values.quantile(0.95)
+
+    # Blank out (NaN) everything outside the open interval (q_low, q_high).
+    trimmed = train_values.where((train_values > q_low) & (train_values < q_high))
+
+    # Trimmed statistics, falling back to the untrimmed ones where trimming
+    # left too few values for the statistic to be defined.
+    train_mean = trimmed.mean().fillna(train_values.mean())
+    train_std = trimmed.std().fillna(train_values.std())
+    train_median = trimmed.median().fillna(train_values.median())
+
+    # The epsilon keeps the division finite for zero-variance features.
+    scale = train_std + 1e-12
+    default_fill = (train_median - train_mean) / scale
+
+    # LOS info
+    LOS_info = {
+        "mean": train_mean["LOS"],
+        "std": train_std["LOS"],
+        "median": train_median["LOS"],
+    }
+
+    # Z-score normalize the train, val, and test sets with the train statistics.
+    train_df[normalize_features] = (train_df[normalize_features] - train_mean) / scale
+    val_df[normalize_features] = (val_df[normalize_features] - train_mean) / scale
+    test_df[normalize_features] = (test_df[normalize_features] - train_mean) / scale
+
+    return train_df, val_df, test_df, default_fill, LOS_info
\ No newline at end of file