Diff of /my_util.py [000000] .. [e6e569]

Switch to side-by-side view

--- a
+++ b/my_util.py
@@ -0,0 +1,89 @@
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+from scipy import odr
+import seaborn as sns
+from enum import Enum
+
+
+
+def df_to_corr_matrix(df, sep=200, annot=True, size_factor=1):
+  correlation_matrix = df.corr()
+
+  mask = np.zeros_like(correlation_matrix)
+  print(correlation_matrix.shape)
+  mask[np.triu_indices_from(mask)] = True
+  plt.figure(figsize = (size_factor*10,size_factor*8))
+  cmap = sns.diverging_palette(260, 10, sep=sep, as_cmap=True)
+  sns.heatmap(correlation_matrix, cmap = cmap, mask=mask, vmin=-1, vmax=1, annot=annot)
+  plt.show()
+
+def remove_outliers(X, y, selected_features):
+  class ReplaceMethod(Enum):
+      SMALLEST = 0
+      LARGEST = 1
+
+  replace_list = [
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_firstorder_10Percentile", "outcome": 0, "threshold": 1.678166},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_firstorder_10Percentile", "outcome": 1, "threshold": -0.617352},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_ngtdm_Busyness", "outcome": 1, "threshold": 838.677442},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_gldm_DependenceEntropy", "outcome": 1, "threshold": 2.478963},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_firstorder_Skewness", "outcome": 1, "threshold": 0.545105},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_firstorder_Skewness", "outcome": 0, "threshold": 0.767485},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_firstorder_Skewness", "outcome": 1, "threshold": -0.995207},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_glrlm_ShortRunHighGrayLevelEmphasis", "outcome": 1, "threshold": 0.363247},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glrlm_ShortRunHighGrayLevelEmphasis", "outcome": 0, "threshold": 0.838612},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_gldm_SmallDependenceEmphasis", "outcome": 0, "threshold": 0.011321},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_shape_MajorAxisLength", "outcome": 0, "threshold": 162.863366},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glrlm_LongRunLowGrayLevelEmphasis", "outcome": 1, "threshold": 39.241005},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glrlm_LongRunLowGrayLevelEmphasis", "outcome": 0, "threshold": 90.572934},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_firstorder_Minimum", "outcome": 0, "threshold": -2.346176},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_firstorder_Minimum", "outcome": 1, "threshold": -2.052288},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_shape_Maximum2DDiameterRow", "outcome": 1, "threshold": 77.252832},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_shape_Maximum2DDiameterRow", "outcome": 0, "threshold": 124.193398},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_shape_SurfaceVolumeRatio", "outcome": 0, "threshold": 0.723904},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_shape_SurfaceVolumeRatio", "outcome": 1, "threshold": 0.772898},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_shape_SurfaceVolumeRatio", "outcome": 1, "threshold": 0.215198},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_shape_LeastAxisLength", "outcome": 0, "threshold": 52.226330},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_shape_LeastAxisLength", "outcome": 1, "threshold": 41.589009},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_shape_LeastAxisLength", "outcome": 1, "threshold": 8.531971},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_glcm_Autocorrelation", "outcome": 0, "threshold": 3.040814},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_glcm_Autocorrelation", "outcome": 1, "threshold": 3.297653},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glszm_SizeZoneNonUniformityNormalized", "outcome": 0, "threshold": 0.653333},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glszm_SizeZoneNonUniformityNormalized", "outcome": 1, "threshold": 0.500000},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glszm_SmallAreaEmphasis", "outcome": 1, "threshold": 0.643301},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_shape_Elongation", "outcome": 0, "threshold": 0.299156},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_shape_Elongation", "outcome": 1, "threshold": 0.350000},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_firstorder_Kurtosis", "outcome": 1, "threshold": 4.760064},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_firstorder_Kurtosis", "outcome": 0, "threshold": 5.157534},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glszm_GrayLevelNonUniformity", "outcome": 1, "threshold": 204.009709},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glszm_GrayLevelNonUniformity", "outcome": 0, "threshold": 290.006849},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_glcm_Imc1", "outcome": 1, "threshold": -0.399542},
+      {"replace_method": ReplaceMethod.SMALLEST, "feature": "original_glcm_Imc1", "outcome": 0, "threshold": -0.451831},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_firstorder_90Percentile", "outcome": 0, "threshold": 4.883197},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_firstorder_90Percentile", "outcome": 1, "threshold": 4.087205},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glcm_Correlation", "outcome": 1, "threshold": 0.569794},
+      {"replace_method": ReplaceMethod.LARGEST, "feature": "original_glcm_Correlation", "outcome": 0, "threshold": 0.704593},
+  ]
+
+  data = pd.concat([y, X], axis=1)
+
+  for task in replace_list:
+      outcome = task["outcome"]
+      feature = task["feature"]
+      threshold = task["threshold"]
+      num_of_replace = 0
+      if feature not in X.columns:
+          continue
+      if task["replace_method"] == ReplaceMethod.LARGEST:
+          num_of_replace = len(data.loc[(data["pCR (outcome)"] == outcome) & (data[feature] > threshold), feature])
+          data.loc[(data["pCR (outcome)"] == outcome) & (data[feature] > threshold), feature] = threshold
+      elif task["replace_method"] == ReplaceMethod.SMALLEST:
+          num_of_replace = len(data.loc[(data["pCR (outcome)"] == outcome) & (data[feature] < threshold), feature])
+          data.loc[(data["pCR (outcome)"] == outcome) & (data[feature] < threshold), feature] = threshold
+      print(f"Replaced {num_of_replace} records in {feature}[{outcome}] to {threshold}")
+
+  X = data[selected_features]
+  y = data["pCR (outcome)"]
+
+  return X, y
\ No newline at end of file