DataPreprocessing.py


#4 Correlation Analysis
* We want to see whether there is any correlation between the features (not all of the features are numeric, so we can only use the numeric ones)
* The code generates a heatmap; the number in each cell is the correlation coefficient between the corresponding pair of features
* The colors show the magnitude of the correlation coefficient: strong positive or negative correlations appear as saturated (dark) colors, while values near zero appear lighter
correlation_matrix = patient_df.corr(numeric_only=True)
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()
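# Optional follow-up (not part of the original script): list the strongest
# pairwise correlations as numbers, complementing the heatmap above.
# Assumes numpy is imported as np, as elsewhere in this script.
corr_pairs = (correlation_matrix.abs()
              .where(~np.eye(len(correlation_matrix), dtype=bool))  # mask the diagonal
              .unstack()
              .dropna()
              .sort_values(ascending=False))
print(corr_pairs.head(10))  # each pair appears twice, once per ordering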
#5 Train and Test Split
X = patient_df.drop(['dissease'], axis=1)
y = patient_df['dissease']
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=15)
X_train
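# Quick sanity check (optional, not in the original script): confirm the split
# sizes and the class balance of the target in both partitions.
print(X_train.shape, X_test.shape)
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))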
#5.1 Dealing with Missing Values
# Dealing with missing data in patient_df:
# Identify numeric and non-numeric columns
numeric_columns_df = patient_df.select_dtypes(include='number').columns
non_numeric_columns_df = patient_df.columns.difference(numeric_columns_df)
# Convert numeric columns to numeric type
patient_df[numeric_columns_df] = patient_df[numeric_columns_df].apply(pd.to_numeric, errors='coerce')
# Handle non-numeric columns (drop them for simplicity)
patient_df = patient_df.drop(columns=non_numeric_columns_df)
# Fill missing values for numeric columns
patient_df.loc[:, numeric_columns_df] = patient_df[numeric_columns_df].fillna(patient_df[numeric_columns_df].mean())
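# Verification sketch (not part of the original script): after the mean fill
# above, no column left in patient_df should contain missing values.
assert patient_df.isna().sum().sum() == 0, 'missing values remain in patient_df'
print(patient_df.isna().sum())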
#5.2 Dealing with missing data in X_train and X_test:
# Imputation
numeric_columns = X_train.select_dtypes(include=['number']).columns
categorical_columns = X_train.columns.difference(numeric_columns)
imputer_numeric = SimpleImputer(strategy='mean')
imputer_categorical = SimpleImputer(strategy='most_frequent')
X_train_numeric = pd.DataFrame(imputer_numeric.fit_transform(X_train[numeric_columns]), columns=numeric_columns)
X_train_categorical = pd.DataFrame(imputer_categorical.fit_transform(X_train[categorical_columns]), columns=categorical_columns)
X_test_numeric = pd.DataFrame(imputer_numeric.transform(X_test[numeric_columns]), columns=numeric_columns)
X_test_categorical = pd.DataFrame(imputer_categorical.transform(X_test[categorical_columns]), columns=categorical_columns)
# Concatenating imputed data
X_train_imputed = pd.concat([X_train_numeric, X_train_categorical], axis=1)
X_test_imputed = pd.concat([X_test_numeric, X_test_categorical], axis=1)
# One-Hot Encoding for categorical variables
encoder = OneHotEncoder(sparse_output=False)  # 'sparse' was renamed to 'sparse_output' in scikit-learn 1.2
X_train_encoded = pd.DataFrame(encoder.fit_transform(X_train_imputed[categorical_columns]), columns=encoder.get_feature_names_out())
X_test_encoded = pd.DataFrame(encoder.transform(X_test_imputed[categorical_columns]), columns=encoder.get_feature_names_out())
X_train_prepared = pd.concat([X_train_imputed.drop(categorical_columns, axis=1), X_train_encoded], axis=1)
X_test_prepared = pd.concat([X_test_imputed.drop(categorical_columns, axis=1), X_test_encoded], axis=1)
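# Shape check (optional, not in the original script): the prepared train and
# test frames must share the same columns in the same order before scaling.
print(X_train_prepared.shape, X_test_prepared.shape)
assert list(X_train_prepared.columns) == list(X_test_prepared.columns)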
# Standardise data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_prepared)
X_test_scaled = scaler.transform(X_test_prepared)
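# Optional check (not in the original script): after StandardScaler the training
# features should have mean ~0 and std ~1; the test set will differ slightly
# because it reuses the statistics fitted on the training data.
print(X_train_scaled.mean(axis=0).round(3))
print(X_train_scaled.std(axis=0).round(3))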
# PCA
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
# Visualising PCA results
plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=y_train, cmap='viridis')
plt.title('PCA Analysis - Training Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
plt.scatter(X_test_pca[:, 0], X_test_pca[:, 1], c=y_test, cmap='plasma')
plt.title('PCA Analysis - Testing Data')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
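# Optional diagnostic (not in the original script): how much of the variance the
# two principal components retain.
print('Explained variance ratio:', pca.explained_variance_ratio_)
print('Total variance retained:', pca.explained_variance_ratio_.sum())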
X_train_prepared
X_train_encoded
categorical_columns
# Outlier Removal
# Outlier Removal on patient_df
# Performing outlier detection using Z-scores
p_num = patient_df[['age', 'bmi', 'alcohol_misuse', 'health_gen', 'health_ment', 'health_phys']]
z1 = np.abs(stats.zscore(p_num))
print('\nZ-Score Array:\n', z1)
print(p_num.shape)
threshold = 3
print('\nOutliers:\n', np.where(z1 > threshold))
p_num
# Plotting all the outliers
plt.figure(figsize=(10,7))
sns.boxplot(data=p_num)
plt.title('Outlier Visualization', fontsize=20)
plt.xticks(rotation=90)
plt.show()
# Removing outliers
p_outliers = (z1 < threshold).all(axis=1)
print(p_outliers.shape)
p_outliers_removed = p_num[p_outliers]
print('\nShape after removing outliers:', p_outliers_removed.shape)
p_outliers
p_outliers_removed
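# Optional summary (not in the original script): how many rows the Z-score
# filter kept on patient_df's numeric columns.
print(f'Rows before: {p_num.shape[0]}, after outlier removal: {p_outliers_removed.shape[0]}')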
# Outlier Removal on X_train
# Splitting a training dataset
X_train_num = X_train_imputed[['age', 'bmi', 'alcohol_misuse', 'health_gen', 'health_ment', 'health_phys']]
X_train_cat = X_train_imputed[['gender', 'high_bp', 'high_chol', 'chol_check', 'history_smoking', 'history_stroke', 'history_heart_disease', 'amount_activity', 'fruits', 'vegetables', 'walking_diff']]
X_train_num
# Calculating the Z-scores for each element in the X_train_num
z1 = np.abs(stats.zscore(X_train_num))
print('\nZ-Score Array:\n', z1)
print(X_train_num.shape)
threshold = 3
print('\nOutliers:\n', np.where(z1 > threshold))
# Identifying outliers in X_train
X_train_outliers = (z1 < threshold).all(axis=1)
print(X_train_outliers.shape)
X_train_outliers
# Remove the outliers in X_train
X_train_outliers_removed = X_train_num[X_train_outliers]
print('\nShape after removing outliers:', X_train_outliers_removed.shape)
X_train_outliers_removed
# Plotting the train data outliers
plt.figure(figsize=(20,10))
sns.boxplot(data=X_train_num)
plt.title('Train Data Outlier Visualization', fontsize=20)
plt.xticks(rotation=90)
plt.show()
# Plotting the train data after removing the outliers
plt.figure(figsize=(20,10))
sns.boxplot(data=X_train_outliers_removed)
plt.title('Train Data After Outlier Removal', fontsize=20)
plt.xticks(rotation=90)
plt.show()
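# Optional summary (not in the original script): rows kept by the Z-score filter
# on the numeric training columns.
print(f'Rows before: {X_train_num.shape[0]}, after outlier removal: {X_train_outliers_removed.shape[0]}')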
# Outlier Removal on X_test
# Calculating the Z-scores for each element in the X_test_num
X_test_num = X_test_imputed[['age', 'bmi', 'alcohol_misuse', 'health_gen', 'health_ment', 'health_phys']]
X_test_cat = X_test_imputed[['gender', 'high_bp', 'high_chol', 'chol_check', 'history_smoking', 'history_stroke', 'history_heart_disease', 'amount_activity', 'fruits', 'vegetables', 'walking_diff']]
z2 = np.abs(stats.zscore(X_test_num))
print('\nZ-Score Array:\n', z2)
print(X_test_num.shape)
threshold = 3
print('\nOutliers:\n', np.where(z2 > threshold))
plt.figure(figsize=(20,10))
sns.boxplot(data=X_test_num)
plt.title('Test Data Outlier Visualization', fontsize=20)
plt.xticks(rotation=90)
plt.show()
# Identifying outliers in X_test
X_test_outliers = (z2 < threshold).all(axis=1)
print(X_test_outliers.shape)
# Remove the outliers in X_test
X_test_outliers_removed = X_test_num[X_test_outliers]
print('\nShape after removing outliers:', X_test_outliers_removed.shape)
X_test_outliers_removed
plt.figure(figsize=(20,10))
sns.boxplot(data=X_test_outliers_removed)
plt.title('Test Data After Outlier Removal', fontsize=20)
plt.xticks(rotation=90)
plt.show()
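# Note (assumption, not in the original script): if the filtered test rows are
# used for evaluation later, the target must be filtered with the same boolean
# mask so features and labels stay aligned.
y_test_outliers_removed = y_test.reset_index(drop=True)[X_test_outliers]
print(X_test_outliers_removed.shape, y_test_outliers_removed.shape)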