b/ML_Models_Pipeline.py
# Clustering Analysis

# Imports used throughout this script (added for completeness). The prepared
# datasets referenced below (X_train, X_train_scaled, X_train_prepared,
# X_train_outliers_removed, X_train_pca, X_test_scaled, X_test_prepared,
# y_train, y_test, patient_df) are assumed to come from the earlier
# preprocessing steps of the pipeline.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)

## 1. Hierarchical Clustering
# Build and plot the dendrogram from the outlier-free training data
plt.figure(figsize=(15, 10))
linkage_matrix = linkage(X_train_outliers_removed, method='ward')
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()

# Cut the dendrogram into k flat clusters
k = 2
clusters = fcluster(linkage_matrix, k, criterion='maxclust')

# Inspect the scaled training data
print(X_train_scaled)
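# Optional sanity check (a sketch, not part of the original script): the
# silhouette score summarises how well-separated the two-cluster cut is
# (values near 1 mean tight, well-separated clusters; values near 0 mean
# overlapping clusters).
from sklearn.metrics import silhouette_score

sil = silhouette_score(X_train_outliers_removed, clusters)
print(f"Silhouette score for the {k}-cluster hierarchical cut: {sil:.3f}")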
## 2. K-means Clustering

num_clusters = 2
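# Optional: a quick elbow scan to sanity-check the choice of num_clusters
# (a sketch under the assumption that X_train_prepared is the fully
# preprocessed, numeric training matrix used for K-means below).
inertias = []
k_values = range(2, 9)
for candidate_k in k_values:
    km = KMeans(n_clusters=candidate_k, random_state=42)
    km.fit(X_train_prepared)
    inertias.append(km.inertia_)
plt.plot(list(k_values), inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Elbow Plot for K-means')
plt.show()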
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_train_prepared)

# Assign cluster labels to the original data
X_train['cluster'] = kmeans.labels_

# Combine the cluster labels with the original training set
patient_df_with_clusters = patient_df.loc[X_train.index].copy()
patient_df_with_clusters['cluster'] = X_train['cluster']

# Explore characteristics of each cluster
for cluster_label in range(num_clusters):
    cluster_data = patient_df_with_clusters[patient_df_with_clusters['cluster'] == cluster_label]
    print(f'\nCluster {cluster_label} Characteristics:')
    print(cluster_data.describe())

# Calculate the mean of each feature for each cluster
# Exclude non-numeric columns from the calculation
cluster_means = patient_df_with_clusters.select_dtypes(include=['number']).groupby('cluster').mean()
print("\nMean Value of Features for Each Cluster:")
print(cluster_means)
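# The scatter plot below uses X_train_pca, which is assumed to be produced by
# an earlier PCA step of the pipeline. If it is not available, a minimal
# 2-component projection could be built as follows (a sketch, not the original
# author's code; assumes X_train_prepared is numeric):
from sklearn.decomposition import PCA

if 'X_train_pca' not in globals():
    pca = PCA(n_components=2, random_state=42)
    X_train_pca = pca.fit_transform(X_train_prepared)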
# Visualise the K-means clusters in the 2-D PCA projection
plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=kmeans.labels_, cmap='viridis')
plt.title('Clustering Analysis')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
# Classifier: The Model (Logistic Regression, RandomForestClassifier, GradientBoostingClassifier, SVM)

## Run the code below (from preparing the classifiers through the comparison printout) in one chunk

# Prepare the classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42)
}

# Train and evaluate the classifiers
results = {}
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_scaled, y_train)

    # Predict on the test set
    y_pred = clf.predict(X_test_scaled)
    y_proba = clf.predict_proba(X_test_scaled)[:, 1]

    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)
    cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)

    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC-AUC Score": roc_auc,
        "Cross-validation scores": cv_scores.tolist(),  # Convert to list for printing
    }

    # Print the performance
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}\n")
    print(f"Cross-validation scores: {cv_scores}\n")

# Compare results
results_df = pd.DataFrame(results).transpose()
print("Comparison of Classifiers:")
print(results_df)
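# Optional: compare the classifiers visually with ROC curves (a sketch, not
# part of the original script; it reuses the models fitted in the loop above
# and the same scaled test set).
from sklearn.metrics import roc_curve

plt.figure(figsize=(8, 6))
for name, clf in classifiers.items():
    fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test_scaled)[:, 1])
    plt.plot(fpr, tpr, label=f"{name} (AUC = {results[name]['ROC-AUC Score']:.3f})")
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for the Candidate Classifiers')
plt.legend()
plt.show()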
## Finding the most important feature

# Feature importance
# Train the classifier (Gradient Boosting)
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train_prepared, y_train)

feature_importance = clf.feature_importances_
important_features = X_train_prepared.columns[np.argsort(feature_importance)[::-1]]
print("Most Important Feature:", important_features[0])
# 6.2 Exclude the best feature and retrain the classifier
# (note: the retrained model here is a Random Forest, while the importance
# ranking above came from the Gradient Boosting model)
X_train_subset = X_train_prepared.drop(important_features[0], axis=1)
X_test_subset = X_test_prepared.drop(important_features[0], axis=1)
clf_subset = RandomForestClassifier(random_state=42)
clf_subset.fit(X_train_subset, y_train)

y_pred_test_subset = clf_subset.predict(X_test_subset)
print("Test Accuracy (Excluding Best Feature):", accuracy_score(y_test, y_pred_test_subset))
# Repeat the analysis, this time ranking features with a Random Forest
# Train the classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_prepared, y_train)

feature_importance = clf.feature_importances_
important_features = X_train_prepared.columns[np.argsort(feature_importance)[::-1]]

print("Most Important Feature:", important_features[0])

# 6.2 Exclude the best feature and retrain the classifier
X_train_subset = X_train_prepared.drop(important_features[0], axis=1)
X_test_subset = X_test_prepared.drop(important_features[0], axis=1)

clf_subset = RandomForestClassifier(random_state=42)
clf_subset.fit(X_train_subset, y_train)

y_pred_test_subset = clf_subset.predict(X_test_subset)
print("Test Accuracy (Excluding Best Feature):", accuracy_score(y_test, y_pred_test_subset))