<a href="https://colab.research.google.com/github/Souhib-khalbous/Quantitative-Analysis-of-T2-Coronal-MRI-Data-for-Treatment-Efficiency-in-Uterine-Fibroids-/blob/master/2KNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Mount The Data**

In [None]:
# This notebook runs on Google Colab, and the dataset lives on Google Drive.
# The Drive therefore has to be mounted into the Colab VM before any file
# under it can be read.
from google.colab import drive

# Directory inside the Colab VM where the Drive contents will appear
DRIVE_MOUNT_POINT = '/content/drive'

drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# **Libraries**

In [None]:
import numpy as np                                                   # For numerical operations, such as array manipulations
import pandas as pd                                                  # For data manipulation and analysis, particularly with tables
import matplotlib.pyplot as plt                                      # For plotting graphs and visualizing data
import seaborn as sns                                                # For statistical plots (confusion-matrix heat map)

import tensorflow as tf                                              # A comprehensive library for ML and DL

# sklearn for ML algorithms, model evaluation, and data preprocessing
from sklearn.neighbors import KNeighborsClassifier                   # Implementing the KNN algorithm for classification
from sklearn.model_selection import train_test_split                 # Splitting data into training and testing sets
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold  # Cross-validated scoring and predictions
from sklearn.pipeline import make_pipeline                           # Chaining preprocessing and model so scaling is fitted per CV fold
from sklearn.preprocessing import StandardScaler                     # Standardizing features by removing the mean and scaling to unit variance


# Metrics for evaluating model performance
from sklearn.metrics import confusion_matrix, classification_report  # For evaluating classification accuracy
from sklearn.metrics import f1_score, accuracy_score                 # Precision metrics
from sklearn.metrics import precision_score, recall_score            # Per-class quality, averaged over classes
from sklearn.metrics import roc_auc_score, roc_curve, auc            # For evaluating the model's ability to distinguish between classes

# **Reading the Data**

In [None]:
# Columns of the spreadsheet that serve as model inputs
feature_columns = [
    'LB', 'AC', 'FM', 'UC', 'DL', 'DS', 'DP',
    'ASTV', 'MSTV', 'ALTV', 'MLTV',
    'Width', 'Min', 'Max', 'Nmax', 'Nzeros',
    'Mode', 'Mean', 'Median', 'Variance', 'Tendency',
]

# Load the 'Raw Data' sheet restricted to the feature columns plus the
# 'CLASS' target, discard every row that contains at least one NaN, and
# renumber the remaining rows from 0 (the old index is thrown away).
dataset = (
    pd.read_excel(
        '/content/drive/MyDrive/Cardiotocographic/CTG.xls',
        sheet_name='Raw Data',
        usecols=feature_columns + ['CLASS'],
    )
    .dropna()
    .reset_index(drop=True)
)

# Quick sanity check: number of usable rows and a preview of the table
print(dataset.shape[0])
print(dataset.head())

# 'dataset' now holds only the selected features and the 'CLASS' target variable

2126
      LB   AC   FM   UC  ASTV  MSTV  ALTV  MLTV   DL   DS  ...   Min    Max  \
0  120.0  0.0  0.0  0.0  73.0   0.5  43.0   2.4  0.0  0.0  ...  62.0  126.0   
1  132.0  4.0  0.0  4.0  17.0   2.1   0.0  10.4  2.0  0.0  ...  68.0  198.0   
2  133.0  2.0  0.0  5.0  16.0   2.1   0.0  13.4  2.0  0.0  ...  68.0  198.0   
3  134.0  2.0  0.0  6.0  16.0   2.4   0.0  23.0  2.0  0.0  ...  53.0  170.0   
4  132.0  4.0  0.0  5.0  16.0   2.4   0.0  19.9  0.0  0.0  ...  53.0  170.0   

   Nmax  Nzeros   Mode   Mean  Median  Variance  Tendency  CLASS  
0   2.0     0.0  120.0  137.0   121.0      73.0       1.0    9.0  
1   6.0     1.0  141.0  136.0   140.0      12.0       0.0    6.0  
2   5.0     1.0  141.0  135.0   138.0      13.0       0.0    6.0  
3  11.0     0.0  137.0  134.0   137.0      13.0       1.0    6.0  
4   9.0     0.0  137.0  136.0   138.0      11.0       1.0    2.0  

[5 rows x 22 columns]


# **Splitting the Data**


*   Define X, y
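*   Scale the Features (standardize each feature to zero mean and unit variance with `StandardScaler`; the values are not bounded to a fixed range)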

*   10 Folds Cross-Validation Method

# This cell is to choose the best parameters for the KNN model.

In [None]:
# Hyper-parameter search for KNN.
# Every (k, weights, metric) combination is scored with 10-fold
# cross-validation (default classifier score) and the combination with the
# highest mean score is reported in `best_result`.

# Features and target variable
X = dataset[feature_columns]
y = dataset['CLASS']          # Target variable

# Standardized copy of the full feature matrix. It is kept because later
# cells inspect `X_scaled`, but it is deliberately NOT used for scoring:
# a scaler fitted on all rows has already seen the held-out folds, which
# leaks information into every cross-validation score.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


# Initialize the list for storing results
results = []

# Define the range of parameters to test
k_values = range(1, 200, 2)  # Odd values between 1 and 199
weights_options = ['uniform', 'distance']
# 'minkowski' is intentionally absent: with the default p=2 it is the same
# distance as 'euclidean', so testing it only repeated identical fits.
metric_options = ['euclidean', 'manhattan', 'chebyshev']

# Loop through all combinations of k, weights, and metrics
for k in k_values:
    for weights in weights_options:
        for metric in metric_options:
            # Initialize KNN with current configuration
            knn = KNeighborsClassifier(n_neighbors=k, weights=weights, metric=metric)

            # Scaling lives inside the pipeline so that, in each fold, the
            # scaler is fitted on the training part only (important for KNN,
            # which is distance based).
            model = make_pipeline(StandardScaler(), knn)

            # Perform 10-fold cross-validation on the raw features and
            # calculate the average score
            cv_scores = cross_val_score(model, X, y, cv=10)
            avg_score = np.mean(cv_scores)

            # Store the results
            results.append({
                'k': k,
                'weights': weights,
                'metric': metric,
                'avg_score': avg_score
            })

# Identify the configuration with the highest average CV score
# (on ties the first one encountered wins)
best_result = max(results, key=lambda x: x['avg_score'])

# Output the best configuration
print(f"Best Configuration:\nK (Number of Neighbors): {best_result['k']}\nWeights: {best_result['weights']}\nMetric: {best_result['metric']}\nAverage CV Score: {best_result['avg_score']:.4f}")




Best Configuration:
K (Number of Neighbors): 37
Weights: distance
Metric: manhattan
Average CV Score: 0.6735




In [None]:
# Sanity check on the modelling inputs: both must describe the same rows.
print(X_scaled.shape)  # Should show (n_samples, n_features)
print(y.shape)         # Should show (n_samples,)

# Printing alone lets a mismatch slip through to a confusing error later on.
# A mismatch means X_scaled and y were built from different versions of
# `dataset` (stale kernel state), so stop here with an actionable message.
assert X_scaled.shape[0] == y.shape[0], (
    f"X_scaled has {X_scaled.shape[0]} rows but y has {y.shape[0]}; "
    "re-run the data-loading and feature cells in order (Restart & Run All)."
)


(906, 21)
(2126,)


In [None]:
# Score the selected KNN configuration with 10-fold cross-validation.

# Features and target variable
X = dataset[feature_columns]
y = dataset['CLASS']          # Target variable

# Standardized copy of the full feature matrix, kept for later cells that
# read `X_scaled`. It is not used for scoring here, because a scaler fitted
# on all rows has already seen the held-out folds.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create KNN model from the configuration selected by the parameter-search
# cell, so the two cells cannot drift apart (the previously hard-coded k did
# not match the reported best configuration).
# To try a different setting by hand, replace the best_result[...] values.
knn = KNeighborsClassifier(
    n_neighbors=best_result['k'],
    weights=best_result['weights'],
    metric=best_result['metric'],
)

# Scaling is part of the pipeline so that it is re-fitted on the training
# portion of every fold (important for KNN, which is distance based).
knn_pipeline = make_pipeline(StandardScaler(), knn)

# Apply 10-fold cross-validation(cv) on the raw features
cv_scores = cross_val_score(knn_pipeline, X, y, cv=10)

# Print out the mean cross-validation score
print("Average 10-Fold CV Score: ", np.mean(cv_scores))

# Optionally, print the scores for each fold
print("Scores for each fold: ", cv_scores)


ValueError: Found input variables with inconsistent numbers of samples: [906, 2126]

# **Evaluation**

In [None]:
# Evaluate the KNN model (`knn`, created in the preceding cell) using
# out-of-fold predictions: every sample is predicted by a model that never
# saw it during fitting.

# Rebuild the inputs from `dataset` instead of relying on whatever X_scaled / y
# happen to be in the kernel; stale leftovers with different row counts make
# scikit-learn fail with "inconsistent numbers of samples".
X = dataset[feature_columns]
y = dataset['CLASS']

# Scaling is part of the pipeline so that it is re-fitted on the training
# portion of every fold rather than on the full data set.
eval_model = make_pipeline(StandardScaler(), knn)

# Define the stratified K-fold cross-validator
cv = StratifiedKFold(n_splits=10)  #This approach is for more reliable prediction estimates.

# Generate cross-validated estimates for each input data point
predictions = cross_val_predict(eval_model, X, y, cv=cv)

# Confusion Matrix, with rows/columns in an explicit, known label order
class_labels = np.unique(y)
conf_matrix = confusion_matrix(y, predictions, labels=class_labels)

# Tick labels are derived from the labels actually used above, so the axes
# cannot disagree with the matrix. ':g' prints numeric codes such as 9.0 as '9'
# (assumes CLASS is numeric, as in the loaded sheet).
tick_labels = [f"{label:g}" for label in class_labels]

# Plotting the Confusion Matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=tick_labels, yticklabels=tick_labels, ax=ax)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
plt.show()



# Performance Metrics
# zero_division=0 keeps the default value for classes that are never
# predicted but states it explicitly instead of emitting a warning.
accuracy = accuracy_score(y, predictions)
f1 = f1_score(y, predictions, average='weighted', zero_division=0)  # Use weighted for multi-class classification
precision = precision_score(y, predictions, average='weighted', zero_division=0)
recall = recall_score(y, predictions, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score (Weighted): {f1:.4f}")
print(f"Precision (Weighted): {precision:.4f}")
print(f"Recall (Weighted): {recall:.4f}")

# Detailed Classification Report
print("\nClassification Report:\n", classification_report(y, predictions, zero_division=0))





ValueError: Found input variables with inconsistent numbers of samples: [906, 2126]