# ML_Models_Pipeline.py
# Clustering and classification pipeline. This script assumes the following
# objects were created by an earlier preparation step (train/test split,
# scaling, outlier removal, and PCA):
#   patient_df, X_train, y_train, y_test,
#   X_train_scaled, X_test_scaled,
#   X_train_prepared, X_test_prepared,
#   X_train_outliers_removed, X_train_pca

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from sklearn.model_selection import cross_val_score
# Clustering Analysis
## 1. Hierarchical Clustering
plt.figure(figsize=(15, 10))
linkage_matrix = linkage(X_train_outliers_removed, method='ward')
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()
# Cut the dendrogram into k = 2 flat clusters
k = 2
clusters = fcluster(linkage_matrix, k, criterion='maxclust')
print(pd.Series(clusters).value_counts())  # size of each hierarchical cluster
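# A quick sanity check on the choice of k (an addition, not in the original
# pipeline): the silhouette score summarizes how well-separated the flat
# clusters are. A minimal sketch, assuming X_train_outliers_removed is the
# same numeric array the linkage above was computed on.
from sklearn.metrics import silhouette_score
for candidate_k in range(2, 6):
    candidate_labels = fcluster(linkage_matrix, candidate_k, criterion='maxclust')
    score = silhouette_score(X_train_outliers_removed, candidate_labels)
    print(f"k={candidate_k}: silhouette={score:.3f}")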
## 2. K-means Clustering
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_train_prepared)
# Assign cluster labels to the original data
X_train['cluster'] = kmeans.labels_
# Combine the cluster labels with the original training set
patient_df_with_clusters = patient_df.loc[X_train.index].copy()
patient_df_with_clusters['cluster'] = X_train['cluster']
# Explore characteristics of each cluster
for cluster_label in range(num_clusters):
    cluster_data = patient_df_with_clusters[patient_df_with_clusters['cluster'] == cluster_label]
    print(f'\nCluster {cluster_label} Characteristics:')
    print(cluster_data.describe())
# Calculate the mean of each feature for each cluster
# Exclude non-numeric columns from the calculation
cluster_means = patient_df_with_clusters.select_dtypes(include=['number']).groupby('cluster').mean()
print("\nMean Value of Features for Each Cluster:")
print(cluster_means)
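# How do the unsupervised clusters line up with the known target labels?
# An added cross-check (not in the original script), assuming y_train shares
# the same index as X_train so the crosstab aligns row-wise.
cluster_vs_label = pd.crosstab(X_train['cluster'], y_train)
print("\nCluster vs. target label:")
print(cluster_vs_label)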
# Visualize the K-means clusters in the 2-D PCA projection
plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=kmeans.labels_, cmap='viridis')
plt.title('K-means Clusters in PCA Space')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
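# Sanity check for num_clusters = 2 (an addition, not part of the original
# script): the elbow method plots K-means inertia across candidate k values.
# A minimal sketch, assuming X_train_prepared is the same matrix fit above.
inertias = []
k_values = range(1, 8)
for k_candidate in k_values:
    km = KMeans(n_clusters=k_candidate, random_state=42, n_init=10)
    km.fit(X_train_prepared)
    inertias.append(km.inertia_)
plt.plot(k_values, inertias, marker='o')
plt.title('Elbow Method for Choosing k')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (within-cluster SSE)')
plt.show()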
# Classifiers: Logistic Regression, Random Forest, Gradient Boosting, SVM
## Run the training-and-evaluation block below in one chunk
# Prepare the classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42)
}
# Train and evaluate the classifiers
results = {}
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_scaled, y_train)
    # Predict on the test set
    y_pred = clf.predict(X_test_scaled)
    y_proba = clf.predict_proba(X_test_scaled)[:, 1]
    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)
    cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)
    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC-AUC Score": roc_auc,
        "Cross-validation scores": cv_scores.tolist(),  # convert to list for printing
    }
    # Print the performance
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}\n")
    print(f"Cross-validation scores: {cv_scores}\n")
# Compare results
results_df = pd.DataFrame(results).transpose()
print("Comparison of Classifiers:")
print(results_df)
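# Optional visual comparison (an addition, not in the original pipeline):
# overlay the ROC curves of the four fitted classifiers on a single axis.
# A minimal sketch, assuming the classifiers dict still holds the estimators
# fitted on X_train_scaled in the loop above.
from sklearn.metrics import RocCurveDisplay
fig, ax = plt.subplots(figsize=(8, 6))
for name, clf in classifiers.items():
    RocCurveDisplay.from_estimator(clf, X_test_scaled, y_test, name=name, ax=ax)
plt.title('ROC Curves of the Four Classifiers')
plt.show()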
## Finding the most important feature
# Feature importance
# Train the classifier (Gradient Boosting)
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train_prepared, y_train)
feature_importance = clf.feature_importances_
important_features = X_train_prepared.columns[np.argsort(feature_importance)[::-1]]
print("Most Important Feature:", important_features[0])
# 6.2 Exclude the best feature and retrain the same model for a fair ablation
X_train_subset = X_train_prepared.drop(important_features[0], axis=1)
X_test_subset = X_test_prepared.drop(important_features[0], axis=1)
clf_subset = GradientBoostingClassifier(random_state=42)  # match the model used above
clf_subset.fit(X_train_subset, y_train)
y_pred_test_subset = clf_subset.predict(X_test_subset)
print("Test Accuracy (Excluding Best Feature):", accuracy_score(y_test, y_pred_test_subset))
# Train the classifier (Random Forest) and repeat the ablation
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_prepared, y_train)
feature_importance = clf.feature_importances_
important_features = X_train_prepared.columns[np.argsort(feature_importance)[::-1]]
print("Most Important Feature:", important_features[0])
# 6.2 Exclude the best feature and retrain the classifier
X_train_subset = X_train_prepared.drop(important_features[0], axis=1)
X_test_subset = X_test_prepared.drop(important_features[0], axis=1)
clf_subset = RandomForestClassifier(random_state=42)
clf_subset.fit(X_train_subset, y_train)
y_pred_test_subset = clf_subset.predict(X_test_subset)
print("Test Accuracy (Excluding Best Feature):", accuracy_score(y_test, y_pred_test_subset))