In [1]:
# standard library
import os
from itertools import cycle
from itertools import product

# third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import xgboost as xgb
#from pyimzml.ImzMLParser import ImzMLParser
from scipy import interp
from scipy.stats import randint, uniform
from sklearn import preprocessing
from sklearn.calibration import calibration_curve
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import roc_curve, auc,classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split 
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.utils import shuffle
from tqdm import tqdm

# project-local
from utils import print_confusion_matrix, assemble_dataset_supervised_learning

## load data

In [None]:
# Paths are built with os.path.join so that no string depends on Windows
# separators or on the invalid "\l" escape sequence; on Windows the resulting
# strings point at the same locations as before.
peaklist = np.array(pd.read_csv(os.path.join('.', 'regions_peaklist_from_marta.txt'), sep=" "))

# Directory listing of the filtered MSI tables folder.
path_data = os.path.join('.', 'msi_tables_filtered')
list_dataset = os.listdir(path_data)

## classification per tile - supervised

# table with slide;label;unified_label;image_name
labels = pd.read_csv(os.path.join('.', 'labels_frozen.txt'), sep=';')

# Project helper: returns the assembled feature table and its labels for the
# "grade" task. The table carries 'dataset_name' and 'image_name' columns,
# which the later cells rely on.
full_dataset, y_labels = assemble_dataset_supervised_learning(labels, list_dataset, path_data, "grade")

## pre-process data per patient with a Box-Cox transform after scaling by a 10**5 factor

In [None]:
dict_X_gauss = {}

# Box-Cox without standardisation; the transformer is re-fitted on every
# dataset in the loop below, so each dataset gets its own lambdas.
pt = preprocessing.PowerTransformer(method='box-cox', standardize=False)

# Feature columns only, scaled by 10**5 before the power transform.
scaled_features = full_dataset.drop(columns=['dataset_name', 'image_name']) * 10**5
dataset_names = full_dataset["dataset_name"].to_numpy()

columns = np.unique(dataset_names)

# Transformed rows are written back at their ORIGINAL row positions, so X_gaus
# stays aligned with y_labels and full_dataset even when the rows are not
# grouped by dataset in sorted order. Iterating over every unique name also
# removes the assumption that "SlideA1" exists and sorts first.
X_gaus = np.empty(scaled_features.shape, dtype=float)
for col in tqdm(columns):
    row_mask = dataset_names == col
    X_gaus[row_mask] = pt.fit_transform(scaled_features.loc[row_mask])

In [None]:
# Hold out 30 % of the rows (fixed seed); features, labels and the
# dataset/image identifiers are split together so they stay row-aligned.
split_parts = train_test_split(
    X_gaus,
    y_labels,
    full_dataset[["dataset_name", 'image_name']],
    test_size=0.30,
    random_state=10,
)
(X_train, X_test_and_valid,
 y_train, y_test_and_valid,
 data_train, data_test_and_valid) = split_parts

In [None]:
# Test set: the first half of the held-out rows.
len_half = len(y_test_and_valid) // 2
test_rows = slice(None, len_half)
X_test = X_test_and_valid[test_rows]
y_test = y_test_and_valid[test_rows]
data_test = data_test_and_valid[test_rows]

In [None]:
# Validation set: the remaining half of the held-out rows (from len_half on).
valid_rows = slice(len_half, None)
X_valid = X_test_and_valid[valid_rows]
y_valid = y_test_and_valid[valid_rows]
data_valid = data_test_and_valid[valid_rows]

## balancing training data

In [None]:
# Seeded generator so the resampling and the final shuffle are reproducible.
rng = np.random.RandomState(10)

# Boolean row masks per class; np.asarray makes the comparison element-wise
# even if y_train is a plain Python list.
y_train_arr = np.asarray(y_train)
is_high_grade = y_train_arr == 'high grade'
is_non_dysplasia = y_train_arr == 'non-dysplasia'
is_low_grade = y_train_arr == 'low grade'

# The other two classes are resampled (with replacement) to the number of
# 'high grade' rows.
max_len = X_train[is_high_grade].shape[0]
len_h = X_train[is_non_dysplasia].shape[0]
len_lg = X_train[is_low_grade].shape[0]

balanced_X_train = np.concatenate((
    X_train[is_non_dysplasia][rng.randint(0, len_h, max_len)],
    X_train[is_low_grade][rng.randint(0, len_lg, max_len)],
    X_train[is_high_grade],
))
# One label per row, in the same class order as the concatenation above.
# (The previous 'highgrade' typo produced zero 'high grade' labels, leaving
# X and y with different lengths.)
balanced_y_train = np.array(
    ['non-dysplasia'] * max_len + ['low grade'] * max_len + ['high grade'] * max_len
)
balanced_X_train, balanced_y_train = shuffle(balanced_X_train, balanced_y_train, random_state=rng)

## grid search for MLP

In [None]:
# Hyper-parameter grid for the MLP: mini-batch size, L2 penalty (alpha) and
# two-layer architectures with 10 or 20 units per layer.
# NOTE(review): batch size 356 breaks the 32/64/128 pattern - possibly meant 256; confirm.
parameters = {
    'batch_size': [32, 64, 128, 356],
    'alpha': 10.0 ** -np.arange(1, 10),
    'hidden_layer_sizes': list(product(np.arange(10, 21, 10), np.arange(10, 21, 10))),
}

# 5-fold cross-validated exhaustive search (the stray double comma in the
# argument list, a SyntaxError, has been removed).
mlp_model = GridSearchCV(
    MLPClassifier(solver='adam', max_iter=1100),
    parameters,
    n_jobs=20,
    cv=5,
    verbose=2,
)

mlp_model.fit(balanced_X_train, balanced_y_train)

## gridsearchCV for random forest

In [None]:
# Hyper-parameter grid for the random forest.
param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [4, 8, 16],
    'criterion': ['gini', 'entropy'],
}

# Base estimator for the search. It was previously referenced without ever
# being defined, which raised NameError on a fresh kernel.
rfc = RandomForestClassifier()

# 5-fold cross-validated exhaustive search over param_grid.
rf_model = GridSearchCV(estimator=rfc, param_grid=param_grid, cv=5, verbose=1, n_jobs=20)
rf_model.fit(balanced_X_train, balanced_y_train)


## gridsearchCV for XGBoost

In [None]:
# Search space for XGBoost. Continuous entries are scipy.stats distributions
# (uniform(loc, scale) samples from [loc, loc + scale]); discrete entries are
# explicit lists.
params = {
    "min_child_weight": list(range(1, 6, 2)),
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3),
    "max_depth": list(range(3, 10, 2)),
    "n_estimators": randint(100, 150),
    "subsample": uniform(0.6, 0.4),
}

# Distributions can only be sampled, not enumerated, so a randomized search
# replaces GridSearchCV here; the resulting object exposes the same
# best_estimator_ / best_params_ attributes.
# - 'multi:softprob' and 'roc_auc_ovr' replace the binary-only objective and
#   scorer because the balanced training labels contain three classes.
# - the `iid` argument was dropped: it no longer exists in scikit-learn.
# NOTE(review): recent xgboost releases expect integer-encoded class labels;
# confirm that the installed version accepts the string labels used here.
xgb_model = RandomizedSearchCV(
    estimator=xgb.XGBClassifier(
        colsample_bytree=0.8,
        objective='multi:softprob',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_distributions=params,
    n_iter=10,
    scoring='roc_auc_ovr',
    n_jobs=12,
    cv=5,
    verbose=1,
    random_state=27,
)

xgb_model.fit(balanced_X_train, balanced_y_train)

## save feature importance

In [None]:
# Feature importances come from the refitted best estimators of the two tree
# searches (the search objects themselves do not expose feature_importances_;
# the previously used names CV_rfc / xgb1 were never defined).
best_rf = rf_model.best_estimator_
best_xgb = xgb_model.best_estimator_

# Same feature columns, in the same order, as used to build the training
# matrix: everything except the two identifier columns.
feature_names = list(full_dataset.drop(columns=['dataset_name', 'image_name']).columns)

results = pd.DataFrame()
results['columns'] = feature_names
results['importances_rf'] = best_rf.feature_importances_
results['importances_xgboost'] = best_xgb.feature_importances_
results['importances_mean'] = np.mean(
    [best_xgb.feature_importances_, best_rf.feature_importances_], axis=0
)
# Most important features first.
results = results.sort_values(by='importances_mean', ascending=False)
results.to_excel(r".\features_rf_xgboost_msi_grade.xlsx", index=False)

# Importance table read from a separate spreadsheet (gland vs tissue), sorted
# the same way.
other_results = pd.read_excel(r".\features_rf_xgboost_msi_gland_vs_tissue.xlsx")
other_results = other_results.sort_values(by='importances_mean', ascending=False)

## ensemble of the best models

In [None]:
# Soft-voting ensemble built from the best estimator of each search
# (MLP, random forest, XGBoost), then fitted on the balanced training set.
best_estimators = [
    ('mlp', mlp_model.best_estimator_),
    ('rf', rf_model.best_estimator_),
    ('xgb', xgb_model.best_estimator_),
]

vc = VotingClassifier(estimators=best_estimators, voting='soft', n_jobs=12)
vc = vc.fit(balanced_X_train, balanced_y_train)