Diff of /ML_Models_Pipeline.py [000000] .. [efae2c]

# Clustering Analysis

# Imports assumed by this script (the diff does not show them)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
from sklearn.model_selection import cross_val_score

##1 Hierarchical Clustering

# Build a Ward-linkage hierarchy on the outlier-free training data
# and visualise it as a dendrogram.
plt.figure(figsize=(15, 10))
linkage_matrix = linkage(X_train_outliers_removed, method='ward')
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()

# Cut the tree into k flat clusters and report how many points land in each.
k = 2
clusters = fcluster(linkage_matrix, k, criterion='maxclust')
print(pd.Series(clusters).value_counts())

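# A quick sanity check on the choice k = 2: silhouette scores for a few
# candidate cuts of the same tree. This is a sketch that was not part of the
# original pipeline; it assumes X_train_outliers_removed is fully numeric.
from sklearn.metrics import silhouette_score
for candidate_k in range(2, 6):
    labels = fcluster(linkage_matrix, candidate_k, criterion='maxclust')
    print(f"k={candidate_k}: silhouette={silhouette_score(X_train_outliers_removed, labels):.3f}")
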
##2 K-means Clustering

num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_train_prepared)

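# The fixed choice num_clusters = 2 can be cross-checked with the elbow
# method. A minimal sketch, not in the original script, assuming
# X_train_prepared is fully numeric:
inertias = []
candidate_range = range(1, 9)
for n in candidate_range:
    inertias.append(KMeans(n_clusters=n, random_state=42).fit(X_train_prepared).inertia_)
plt.plot(list(candidate_range), inertias, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia (within-cluster sum of squares)')
plt.title('Elbow Method for K-means')
plt.show()
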
# Assign cluster labels to the training rows
X_train['cluster'] = kmeans.labels_

# Attach the cluster labels to the matching rows of the original patient data
patient_df_with_clusters = patient_df.loc[X_train.index].copy()
patient_df_with_clusters['cluster'] = X_train['cluster']

# Explore the characteristics of each cluster
for cluster_label in range(num_clusters):
    cluster_data = patient_df_with_clusters[patient_df_with_clusters['cluster'] == cluster_label]
    print(f'\nCluster {cluster_label} Characteristics:')
    print(cluster_data.describe())

# Calculate the mean of each feature for each cluster
# (non-numeric columns are excluded from the calculation)
cluster_means = patient_df_with_clusters.select_dtypes(include=['number']).groupby('cluster').mean()
print("\nMean Value of Features for Each Cluster:")
print(cluster_means)

# Visualise the K-means clusters in the 2-D PCA projection
plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=kmeans.labels_, cmap='viridis')
plt.title('K-means Clusters (PCA Projection)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()

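# X_train_pca is assumed to come from an earlier, unshown step. For
# completeness, a hypothetical sketch of how such a projection is usually
# produced (this reproduces the variable; it is not the original code):
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_train_pca = pca.fit_transform(X_train_prepared)
print("Explained variance of the 2 components:", pca.explained_variance_ratio_)
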
# Classifiers: Logistic Regression, Random Forest, Gradient Boosting, SVM

## Run the code below, down to the classifier comparison, in one chunk

# Prepare the classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42)
}
# Train and evaluate the classifiers
results = {}
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_scaled, y_train)
    # Predict on the test set
    y_pred = clf.predict(X_test_scaled)
    y_proba = clf.predict_proba(X_test_scaled)[:, 1]
    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)
    # 5-fold cross-validation on the training set (cross_val_score fits fresh clones)
    cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)
    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC-AUC Score": roc_auc,
        "Cross-validation scores": cv_scores.tolist(),  # Convert to list for printing
    }
    # Print the performance
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}")
    print(f"Cross-validation scores: {cv_scores}\n")

# Compare results
results_df = pd.DataFrame(results).transpose()
print("Comparison of Classifiers:")
print(results_df)

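# The table above compares point metrics only. As an additional, hedged check
# that is not in the original script, the test-set ROC curves of the
# already-fitted classifiers can be overlaid:
from sklearn.metrics import roc_curve
plt.figure(figsize=(8, 6))
for name, clf in classifiers.items():
    fpr, tpr, _ = roc_curve(y_test, clf.predict_proba(X_test_scaled)[:, 1])
    plt.plot(fpr, tpr, label=name)
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves on the Test Set')
plt.legend()
plt.show()
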
## Finding the most important feature

# Feature importance from a Gradient Boosting classifier
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train_prepared, y_train)

# Rank features by importance, highest first
feature_importance = clf.feature_importances_
important_features = X_train_prepared.columns[np.argsort(feature_importance)[::-1]]
print("Most Important Feature:", important_features[0])

# 6.2 Exclude the best feature and retrain the classifier
X_train_subset = X_train_prepared.drop(important_features[0], axis=1)
X_test_subset = X_test_prepared.drop(important_features[0], axis=1)
clf_subset = GradientBoostingClassifier(random_state=42)  # same model type as above for a fair comparison
clf_subset.fit(X_train_subset, y_train)

y_pred_test_subset = clf_subset.predict(X_test_subset)
print("Test Accuracy (Excluding Best Feature):", accuracy_score(y_test, y_pred_test_subset))

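# For context, a baseline the number above can be compared against: accuracy
# of the same model with all features kept. A small sketch, not in the
# original script; clf is still the full-feature Gradient Boosting model.
y_pred_full = clf.predict(X_test_prepared)
print("Test Accuracy (All Features):", accuracy_score(y_test, y_pred_full))
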
# Repeat with a Random Forest classifier
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_prepared, y_train)

feature_importance = clf.feature_importances_
important_features = X_train_prepared.columns[np.argsort(feature_importance)[::-1]]
print("Most Important Feature:", important_features[0])

# 6.2 Exclude the best feature and retrain the classifier
X_train_subset = X_train_prepared.drop(important_features[0], axis=1)
X_test_subset = X_test_prepared.drop(important_features[0], axis=1)

clf_subset = RandomForestClassifier(random_state=42)
clf_subset.fit(X_train_subset, y_train)

y_pred_test_subset = clf_subset.predict(X_test_subset)
print("Test Accuracy (Excluding Best Feature):", accuracy_score(y_test, y_pred_test_subset))
