# Clustering Analysis
# Imports used in this section (some may duplicate earlier imports in the
# script; pandas as pd, numpy as np, and matplotlib.pyplot as plt are assumed
# to be imported already).
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)

## 1. Hierarchical Clustering
plt.figure(figsize=(15, 10))
linkage_matrix = linkage(X_train_outliers_removed, method='ward')
dendrogram(linkage_matrix)
plt.title('Hierarchical Clustering Dendrogram')
plt.show()
# Cut the dendrogram into k flat clusters
k = 2
clusters = fcluster(linkage_matrix, k, criterion='maxclust')
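# Optional sanity check on k=2 (a sketch, not part of the original analysis):
# the silhouette score rates how well-separated the flat clusters are, with
# values near 1 indicating tight, distinct clusters.
from sklearn.metrics import silhouette_score
sil = silhouette_score(X_train_outliers_removed, clusters)
print(f"Silhouette score for k={k}: {sil:.3f}")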
## 2. K-means Clustering
num_clusters = 2
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)  # n_init pinned for reproducibility across sklearn versions
kmeans.fit(X_train_prepared)
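# Optional sketch to sanity-check num_clusters=2: an elbow curve of K-means
# inertia (within-cluster sum of squares) over candidate k values. The range
# 1-8 is an arbitrary choice for illustration.
inertias = []
for k_try in range(1, 9):
    km = KMeans(n_clusters=k_try, n_init=10, random_state=42).fit(X_train_prepared)
    inertias.append(km.inertia_)
plt.plot(range(1, 9), inertias, marker='o')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Inertia')
plt.title('Elbow Curve for K-means')
plt.show()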
# Assign cluster labels back to the training frame (rows of X_train_prepared
# are assumed to align one-to-one with X_train)
X_train['cluster'] = kmeans.labels_
# Combine the cluster labels with the original training set
patient_df_with_clusters = patient_df.loc[X_train.index].copy()
patient_df_with_clusters['cluster'] = X_train['cluster']
# Explore characteristics of each cluster
for cluster_label in range(num_clusters):
    cluster_data = patient_df_with_clusters[patient_df_with_clusters['cluster'] == cluster_label]
    print(f'\nCluster {cluster_label} Characteristics:')
    print(cluster_data.describe())
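# If the training labels are at hand, a crosstab shows how the clusters line
# up with the outcome. This is a sketch and assumes y_train shares X_train's
# row order.
print(pd.crosstab(X_train['cluster'], y_train, rownames=['cluster'], colnames=['label']))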
# Calculate the mean of each feature for each cluster
# Exclude non-numeric columns from the calculation
cluster_means = patient_df_with_clusters.select_dtypes(include=['number']).groupby('cluster').mean()
print("\nMean Value of Features for Each Cluster:")
print(cluster_means)
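# X_train_pca is assumed to come from an earlier PCA step in the script. If it
# does not exist yet, a minimal two-component projection would look like this
# (a sketch, not necessarily the original preprocessing):
# from sklearn.decomposition import PCA
# X_train_pca = PCA(n_components=2).fit_transform(X_train_prepared)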
plt.scatter(X_train_pca[:, 0], X_train_pca[:, 1], c=kmeans.labels_, cmap='viridis')
plt.title('K-means Clusters (PCA projection)')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.show()
# Classification models: Logistic Regression, Random Forest, Gradient Boosting, and SVM
## Run the full training-and-evaluation block below in one chunk
# Prepare the classifiers
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),  # max_iter raised to avoid convergence warnings
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Support Vector Machine": SVC(probability=True, random_state=42),
}
# Train and evaluate the classifiers
results = {}
for name, clf in classifiers.items():
    # Train the classifier
    clf.fit(X_train_scaled, y_train)
    # Predict on the test set
    y_pred = clf.predict(X_test_scaled)
    y_proba = clf.predict_proba(X_test_scaled)[:, 1]
    # Evaluate the classifier
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_proba)
    cv_scores = cross_val_score(clf, X_train_scaled, y_train, cv=5)
    # Store results
    results[name] = {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1 Score": f1,
        "ROC-AUC Score": roc_auc,
        "Cross-validation scores": cv_scores.tolist(),  # convert to list for printing
    }
    # Print the performance
    print(f"Results for {name}:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC-AUC Score: {roc_auc:.4f}\n")
    print(f"Cross-validation scores: {cv_scores}\n")
# Compare results
results_df = pd.DataFrame(results).transpose()
print("Comparison of Classifiers:")
print(results_df)
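# Optional follow-up sketch: overlay ROC curves for the fitted classifiers.
# RocCurveDisplay.from_estimator is available in scikit-learn >= 1.0.
from sklearn.metrics import RocCurveDisplay
fig, ax = plt.subplots(figsize=(8, 6))
for name, clf in classifiers.items():
    RocCurveDisplay.from_estimator(clf, X_test_scaled, y_test, name=name, ax=ax)
plt.title('ROC Curves on the Test Set')
plt.show()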
## Finding the most important feature
# Feature importance
# Train the Gradient Boosting classifier
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train_prepared, y_train)
feature_importance = clf.feature_importances_
important_features = X_train_prepared.columns[np.argsort(feature_importance)[::-1]]
print("Most Important Feature:", important_features[0])
# 6.2 Exclude the best feature and retrain the classifier
X_train_subset = X_train_prepared.drop(important_features[0], axis=1)
X_test_subset = X_test_prepared.drop(important_features[0], axis=1)
clf_subset = GradientBoostingClassifier(random_state=42)  # same model family as above for a fair comparison
clf_subset.fit(X_train_subset, y_train)
y_pred_test_subset = clf_subset.predict(X_test_subset)
print("Test Accuracy (Excluding Best Feature):", accuracy_score(y_test, y_pred_test_subset))
# Repeat the experiment with Random Forest
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_prepared, y_train)
feature_importance = clf.feature_importances_
important_features = X_train_prepared.columns[np.argsort(feature_importance)[::-1]]
print("Most Important Feature:", important_features[0])
# Exclude the best feature (per Random Forest) and retrain the classifier
X_train_subset = X_train_prepared.drop(important_features[0], axis=1)
X_test_subset = X_test_prepared.drop(important_features[0], axis=1)
clf_subset = RandomForestClassifier(random_state=42)
clf_subset.fit(X_train_subset, y_train)
y_pred_test_subset = clf_subset.predict(X_test_subset)
print("Test Accuracy (Excluding Best Feature):", accuracy_score(y_test, y_pred_test_subset))