## Preparing libraries

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,KFold,StratifiedKFold
import torch
# Enumerate GPUs by PCI bus id, then expose only physical GPUs 4-7 to this
# process. Both variables must be set before CUDA is first initialised.
os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID'
os.environ['CUDA_VISIBLE_DEVICES']='4,5,6,7'                        
# "cuda:0" is the first *visible* GPU; fall back to the CPU when CUDA is unavailable.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
from sklearn import preprocessing

from torch.utils.data import TensorDataset, DataLoader,Dataset
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torch.optim as optim
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import time 
import tqdm
import random
from PIL import Image
train_on_gpu = True
from torch.utils.data.sampler import SubsetRandomSampler
from torch.optim.lr_scheduler import StepLR, ReduceLROnPlateau, CosineAnnealingLR
from matplotlib.image import imread
from torch.utils import data
import seaborn as sns

from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc,roc_auc_score,classification_report, confusion_matrix
from sklearn.preprocessing import label_binarize, StandardScaler
from sklearn.multiclass import OneVsRestClassifier, unique_labels
from scipy import interp
from utils import print_confusion_matrix

from sklearn.utils import shuffle
import albumentations
from albumentations import torch as AT
import scipy.special

from pytorchcv.model_provider import get_model as ptcv_get_model

cudnn.benchmark = True
import warnings
warnings.filterwarnings("ignore")

from torch.utils.data import WeightedRandomSampler
from processing_pytorch import generate_dataset_grade, CancerDataset

## Define constants

In [None]:
# Side length, in pixels, that every image is resized to (INPUT_SHAPE x INPUT_SHAPE)
# before it is fed to the attention network.
INPUT_SHAPE = 224 ##for attention network

In [None]:
SEED = 323


def seed_everything(seed=SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), exports ``PYTHONHASHSEED`` and asks cuDNN for deterministic
    kernels.

    NOTE(review): ``cudnn.benchmark = True`` is set in the import cell;
    autotuned algorithm selection may still vary between runs - confirm
    whether full determinism is required.

    Args:
        seed: Integer seed applied to every generator.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker
    # processes); the running interpreter fixed its hash seed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # The model is spread over several GPUs, so seed all of them, not just
    # the current device.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(SEED)

In [None]:
# Root directory handed to generate_dataset_grade as its first argument.
main_path = '.'

## Preparing data

In [None]:
# Load the training split described by msi_data_train_grade.csv.
# Judging by how the three values are used further down, they are the images,
# their grade labels (strings) and the per-image paths - confirm against
# processing_pytorch.generate_dataset_grade.
X_train, y_train, train_paths = generate_dataset_grade(main_path,os.path.join(r'.\\','msi_data_train_grade.csv'))

In [None]:
# Load the test split described by msi_data_test_grade.csv (presumably images,
# grade labels and per-image paths - confirm against generate_dataset_grade).
X_test, y_test, test_paths = generate_dataset_grade(main_path,os.path.join(r'.\\','msi_data_test_grade.csv'))

In [None]:
# Load the validation split described by msi_data_valid_grade.csv (presumably
# images, grade labels and per-image paths - confirm against generate_dataset_grade).
X_valid, y_valid, valid_paths = generate_dataset_grade(main_path,os.path.join(r'.\\','msi_data_valid_grade.csv'))

### Balance the training set by downsampling

In [None]:
# Row positions of each grade inside the training labels.
index_h = list(np.where(y_train == 'non-dysplasia')[0])
index_lg = list(np.where(y_train == 'low grade')[0])
index_hg = list(np.where(y_train == 'high grade')[0])
# Every class is resampled to the size of the 'high grade' class, which is kept whole.
max_len = len(index_hg)
len_h = len(index_h)
len_lg = len(index_lg)


def _rows_at(samples, indices):
    """Return the samples whose position is listed in *indices*, in their original order."""
    # A set makes each membership test O(1); testing against the list made
    # the selection quadratic in the number of training images.
    wanted = set(indices)
    return np.array([x for i, x in enumerate(samples) if i in wanted])


# np.random.randint draws WITH replacement, so one image can be picked several
# times. The two draws are made in the same order as before (non-dysplasia
# first, then low grade) so a given seed yields the same selection.
_sampled_h = _rows_at(X_train, index_h)[np.random.randint(0, len_h, max_len)]
_sampled_lg = _rows_at(X_train, index_lg)[np.random.randint(0, len_lg, max_len)]
_all_hg = _rows_at(X_train, index_hg)

balanced_X_train = np.concatenate((_sampled_h, _sampled_lg, _all_hg))
# Labels follow the concatenation order above.
balanced_y_train = np.array(['non-dysplasia']*max_len + ['low grade']*max_len  + ['high grade']*max_len)
balanced_X_train,balanced_y_train = shuffle(balanced_X_train,balanced_y_train,random_state=SEED)

In [None]:
# Learn the class order from the balanced training labels only, then encode
# all three splits with that same fitted binarizer.
lb = preprocessing.LabelBinarizer()
lb.fit(balanced_y_train)
bin_balanced_y_train = lb.transform(balanced_y_train)
bin_y_val = lb.transform(y_valid)
bin_y_test = lb.transform(y_test)

In [None]:
# Cast the pixel data to 8-bit unsigned integers before it is handed to the
# albumentations pipelines.
balanced_X_train=balanced_X_train.astype(np.uint8)

In [None]:
# balanced_y_train holds grade names at this point; casting strings such as
# 'non-dysplasia' to uint8 raises ValueError. The training loop calls
# argmax(1) on each label batch, so the labels it needs are the binarized
# rows produced by the LabelBinarizer cell - use those, stored as uint8.
balanced_y_train = bin_balanced_y_train.astype(np.uint8)

## Model

In [None]:
def _build_pipeline(*augmentations):
    """Compose a pipeline: resize, the given augmentations, normalise, convert to tensor.

    Every pipeline in this cell shares the same first step (resize to
    INPUT_SHAPE x INPUT_SHAPE) and the same two final steps; only the
    augmentations in between differ. Fresh Resize / Normalize / ToTensor
    instances are created for each pipeline.
    """
    return albumentations.Compose([
        albumentations.Resize(INPUT_SHAPE, INPUT_SHAPE),
        *augmentations,
        # Per-channel mean/std; presumably the ImageNet statistics matching
        # the pretrained backbone - confirm.
        albumentations.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]),
        AT.ToTensor(),
    ])


# Training pipeline: geometric flips/rotations, one optional photometric or
# noise transform, colour jitter and a random affine shift/scale/rotate.
data_transforms = _build_pipeline(
    albumentations.RandomRotate90(p=0.5),
    albumentations.Transpose(p=0.5),
    albumentations.Flip(p=0.5),
    albumentations.OneOf([
        albumentations.CLAHE(clip_limit=2),
        albumentations.IAASharpen(),
        albumentations.IAAEmboss(),
        albumentations.RandomBrightness(),
        albumentations.RandomContrast(),
        albumentations.JpegCompression(),
        albumentations.Blur(),
        albumentations.GaussNoise(),
    ], p=0.5),
    albumentations.HueSaturationValue(p=0.5),
    albumentations.ShiftScaleRotate(shift_limit=0.15, scale_limit=0.15, rotate_limit=45, p=0.5),
)

# Evaluation pipeline: no augmentation at all.
data_transforms_test = _build_pipeline()

# TTA pipeline 0: each geometric transform is applied with probability 0.5.
data_transforms_tta0 = _build_pipeline(
    albumentations.RandomRotate90(p=0.5),
    albumentations.Transpose(p=0.5),
    albumentations.Flip(p=0.5),
)

# TTA pipelines 1-3: exactly one geometric transform, always applied.
data_transforms_tta1 = _build_pipeline(albumentations.RandomRotate90(p=1))

data_transforms_tta2 = _build_pipeline(albumentations.Transpose(p=1))

data_transforms_tta3 = _build_pipeline(albumentations.Flip(p=1))

# The training loop calls argmax(1) on every label batch, so each dataset is
# given the binarized label rows rather than the grade names.
# Assumes CancerDataset returns the label row it was given unchanged - confirm
# against processing_pytorch.
dataset = CancerDataset(balanced_X_train, bin_balanced_y_train,  transform=data_transforms)
test_set = CancerDataset(X_test, bin_y_test,  transform=data_transforms_test)
# The validation split is loaded as X_valid / y_valid; X_val and y_val were
# never defined and raised NameError here.
val_set = CancerDataset(X_valid, bin_y_val,  transform=data_transforms_test)
batch_size = 16
num_workers = 0  # load batches in the main process
# prepare data loaders (no sampler, so batches follow dataset order; the
# training arrays were shuffled once when the balanced set was built)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=None, num_workers=num_workers)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, sampler=None, num_workers=num_workers)
valid_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size, sampler=None, num_workers=num_workers)

## Training

In [None]:
# Pretrained CBAM-ResNet50 backbone from pytorchcv.
model_conv = ptcv_get_model("cbam_resnet50", pretrained=True)
# Replace the classification head with a freshly initialised 3-way linear
# layer (one output per grade) on top of the 2048-dimensional features.
grade_head = nn.Linear(2048, 3, bias=True)
model_conv.output = grade_head

In [None]:
# Use the device chosen at the top of the notebook so the CPU fallback also
# works; on a CUDA machine this is the same as .cuda().
model_conv.to(device)
criterion = nn.CrossEntropyLoss() #weight=class_weights.to(device)

optimizer = optim.Adam(model_conv.parameters(), lr=0.0001)

# Multiply the learning rate by 0.2 every 5 scheduler steps.
scheduler = StepLR(optimizer, 5, gamma=0.2)
# The scheduler is deliberately not stepped here: PyTorch expects
# optimizer.step() to run before the first scheduler.step(). With
# step_size=5 a single early step did not change the learning rate anyway.

In [None]:
# Replicate the model over every visible GPU. The device list is derived from
# the actual GPU count; a fixed [0, 1, 2, 3] fails when only 2 or 3 GPUs are
# visible.
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    print(gpu_count)
    model_conv = nn.DataParallel(model_conv, device_ids=list(range(gpu_count)))
model_conv.to(device)

In [None]:
def multi_acc(y_pred, y_test):
    """Return the fraction of samples whose highest-scoring class equals the label.

    Args:
        y_pred: Tensor of per-class scores, one row per sample (classes on dim 1).
        y_test: Tensor of integer class indices, one per sample.

    Returns:
        A 0-dim float tensor in [0, 1] (NaN for an empty batch).
    """
    # The arg-max of the raw scores equals the arg-max of their log-softmax,
    # so no activation is needed to pick the predicted class.
    y_pred_tags = torch.argmax(y_pred, dim=1)

    correct_pred = (y_pred_tags == y_test).float()
    # Return the exact fraction: rounding it collapsed every batch to 0 or 1
    # and made the averaged epoch accuracy meaningless.
    return correct_pred.sum() / len(correct_pred)

In [None]:
EPOCHS=20
train_acc_max = 0


def _evaluate(loader):
    """Return (mean batch loss, mean batch accuracy) of model_conv over *loader*.

    Switches the model to eval mode and disables gradient tracking; the
    caller is responsible for switching back to train mode afterwards.
    """
    model_conv.eval()
    total_loss = 0
    total_acc = 0
    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            y_pred = model_conv(X_batch)
            # Labels arrive as one row per sample; argmax(1) turns them into class indices.
            y_batch = y_batch.argmax(1)
            total_loss += criterion(y_pred, y_batch).item()
            total_acc += multi_acc(y_pred, y_batch).item()
    return total_loss / len(loader), total_acc / len(loader)


print("Begin training.")
for e in tqdm.tqdm(range(1, EPOCHS+1)):

    # TRAINING
    train_epoch_loss = 0
    train_epoch_acc = 0
    model_conv.train()
    for tr_batch_i ,(X_train_batch, y_train_batch)  in enumerate(train_loader):
        X_train_batch, y_train_batch = X_train_batch.to(device), y_train_batch.to(device)
        optimizer.zero_grad()

        y_train_pred = model_conv(X_train_batch)

        y_train_batch = y_train_batch.argmax(1)

        batch_loss = criterion(y_train_pred, y_train_batch)
        batch_acc = multi_acc(y_train_pred, y_train_batch)

        batch_loss.backward()
        optimizer.step()

        train_epoch_loss += batch_loss.item()
        train_epoch_acc += batch_acc.item()

        # TESTING - every 600 training batches.
        # NOTE(review): with fewer than 600 batches per epoch nothing is ever
        # evaluated or saved.
        batches_seen = tr_batch_i + 1
        if batches_seen % 600 == 0:
            test_loss, test_acc = _evaluate(test_loader)
            # Evaluation left the model in eval mode; switch back so the rest
            # of the epoch trains with dropout / batch-norm updates enabled.
            model_conv.train()

            # Average over the batches processed so far. Dividing the partial
            # sums by len(train_loader) under-reported both numbers mid-epoch.
            running_train_loss = train_epoch_loss / batches_seen
            train_acc = train_epoch_acc / batches_seen

            print(f'Epoch {e+0:03}: | Train Loss: {running_train_loss:.5f} | Test Loss: {test_loss:.5f} | Train Acc: {train_acc:.3f}| Test Acc: {test_acc:.3f}')

            # NOTE(review): checkpoints are selected on *training* accuracy and
            # valid_loader is never used - confirm this is intended.
            if train_acc > train_acc_max:
                print('Training accuracy increased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                train_acc_max,
                train_acc))
                checkpoint_path = r".\models_attention-grade-classif//model_epoch_{}_test_{:.4f}.pt".format(e ,(test_acc*100))
                # torch.save does not create missing directories.
                checkpoint_dir = os.path.dirname(checkpoint_path)
                if checkpoint_dir:
                    os.makedirs(checkpoint_dir, exist_ok=True)
                torch.save(model_conv.state_dict(), checkpoint_path)
                train_acc_max = train_acc

    # Advance the learning-rate schedule once per epoch, after that epoch's
    # optimizer steps; without this call the StepLR decay never takes effect.
    scheduler.step()

## Generate DL features tables

In [None]:
# Switch to inference mode (affects layers such as dropout and batch norm)
# before extracting features and predictions.
model_conv.eval()

In [None]:
# NOTE(review): '.' looks like a placeholder - replace it with the path of one
# of the .pt checkpoints written by the training loop before running this cell.
# Those checkpoints are saved from model_conv as it is at training time, so the
# state-dict keys only match if model_conv is wrapped (or not wrapped) in
# DataParallel the same way here - TODO confirm.
saved_dict = torch.load(r'.') #load weights
model_conv.load_state_dict(saved_dict)

In [None]:
import copy

# DataParallel keeps the real network in `.module`. Taking the last child of
# model_conv only reached that network when the wrapper was present; on a
# single-GPU or CPU run it returned the final Linear layer, which has no
# `.features`.
_backbone = model_conv.module if isinstance(model_conv, nn.DataParallel) else model_conv
# Module.cpu() moves parameters in place. Building the extractor from the
# original layers dragged model_conv's own feature layers onto the CPU and
# broke the later GPU inference, so work on a deep copy instead.
new_classifier = nn.Sequential(*copy.deepcopy(_backbone.features)).cpu()
# Feature extraction must not use dropout or update batch-norm statistics.
new_classifier.eval()

In [None]:
# Deterministic resize + normalise pipeline used for feature extraction.
# NOTE(review): this rebinds `data_transforms`, which previously named the
# training augmentation pipeline. def_tta looks `data_transforms` up when it is
# called, so once this cell has run its last TTA passes use this plain pipeline
# instead of the training augmentations - confirm whether that is intended.
data_transforms = albumentations.Compose([
    albumentations.Resize(INPUT_SHAPE, INPUT_SHAPE),
    albumentations.Normalize(mean=[0.485, 0.456, 0.406],std=[0.229, 0.224, 0.225]),
    AT.ToTensor()
    ])

In [None]:
# Deep-feature table for the (unbalanced) training split: one row per image,
# keyed by its path, one column per flattened feature value.
train_features = {}
# Inference only: no_grad stops PyTorch from recording an autograd graph for
# every single image.
with torch.no_grad():
    for i, temp_X in tqdm.tqdm(enumerate(X_train), total=len(X_train)):
        # Transform one image and add a leading batch dimension of size 1.
        tensor_X = data_transforms(image=temp_X)["image"].unsqueeze(0)
        train_features[train_paths[i]] = torch.flatten(new_classifier(tensor_X)).numpy()
# from_dict puts one column per path; transpose so each image becomes a row.
train_features = pd.DataFrame.from_dict(train_features).T
train_features.to_csv(r".\train_features_grade_he.csv")

In [None]:
# Deep-feature table for the test split: one row per image, keyed by its
# path, one column per flattened feature value.
test_features = {}
# Inference only: no_grad stops PyTorch from recording an autograd graph for
# every single image.
with torch.no_grad():
    for i, temp_X in tqdm.tqdm(enumerate(X_test), total=len(X_test)):
        # Transform one image and add a leading batch dimension of size 1.
        tensor_X = data_transforms(image=temp_X)["image"].unsqueeze(0)
        test_features[test_paths[i]] = torch.flatten(new_classifier(tensor_X)).numpy()
# from_dict puts one column per path; transpose so each image becomes a row.
test_features = pd.DataFrame.from_dict(test_features).T
test_features.to_csv(r".\test_features_grade_he.csv")

In [None]:
# Deep-feature table for the validation split: one row per image, keyed by
# its path, one column per flattened feature value.
valid_features = {}
# Inference only: no_grad stops PyTorch from recording an autograd graph for
# every single image.
with torch.no_grad():
    # The validation split is loaded as X_valid / valid_paths; X_val and
    # val_paths were never defined and raised NameError here.
    for i, temp_X in tqdm.tqdm(enumerate(X_valid), total=len(X_valid)):
        # Transform one image and add a leading batch dimension of size 1.
        tensor_X = data_transforms(image=temp_X)["image"].unsqueeze(0)
        valid_features[valid_paths[i]] = torch.flatten(new_classifier(tensor_X)).numpy()
# from_dict puts one column per path; transpose so each image becomes a row.
valid_features = pd.DataFrame.from_dict(valid_features).T
valid_features.to_csv(r".\valid_features_grade_he.csv")

## TTA inference

In [None]:
# Number of test-time-augmentation passes made by def_tta; each pass's scores
# are divided by this value.
NUM_TTA = 10

In [None]:
def sigmoid(x):
    """Return the element-wise logistic function 1 / (1 + exp(-x)) of *x*."""
    return scipy.special.expit(x)

In [None]:
def def_tta(X_data,y_data):
    """Average sigmoid-activated model outputs over ``NUM_TTA`` augmented passes.

    Pass 0 uses the plain test pipeline; passes 1-3 use the pipelines that
    always apply RandomRotate90 / Transpose / Flip respectively; passes 4-7
    use the random geometric pipeline ``data_transforms_tta0``; any further
    pass uses ``data_transforms``.

    NOTE(review): ``data_transforms`` is looked up at call time. The feature
    extraction section rebinds it to a plain resize + normalise pipeline, so
    the last passes are only augmented if that cell has not been run.

    Args:
        X_data: Images handed to ``CancerDataset``.
        y_data: Labels handed to ``CancerDataset``; they are not used for scoring.

    Returns:
        DataFrame with an ``imgs`` column and the columns ``preds 0``,
        ``preds 1`` and ``preds 2`` holding the per-class score averaged over
        all passes.

    Raises:
        ValueError: If ``NUM_TTA`` is smaller than 1.
    """
    if NUM_TTA < 1:
        raise ValueError("NUM_TTA must be at least 1")

    # Pipelines for the first passes, indexed by pass number.
    fixed_pipelines = (
        data_transforms_test,
        data_transforms_tta1,
        data_transforms_tta2,
        data_transforms_tta3,
    )

    # Inference must not use dropout or update batch-norm statistics.
    model_conv.eval()

    test_preds = None
    for num_tta in range(NUM_TTA):
        if num_tta < len(fixed_pipelines):
            transform = fixed_pipelines[num_tta]
        elif num_tta < 8:
            transform = data_transforms_tta0
        else:
            transform = data_transforms

        test_set = CancerDataset(X_data, y_data,  transform=transform)
        test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, num_workers=num_workers)

        batch_scores = []
        with torch.no_grad():
            # Targets are ignored, so they are not moved to the GPU.
            for data, _target in test_loader:
                output = model_conv(data.to(device))
                # Pre-divide by NUM_TTA so summing all passes yields the mean.
                batch_scores.append(sigmoid(output.cpu().numpy()) / NUM_TTA)

        if batch_scores:
            array_pred = np.concatenate(batch_scores).reshape(-1,3)
        else:
            array_pred = np.empty((0, 3))

        if test_preds is None:
            test_preds = pd.DataFrame({'imgs': test_set.image_files_list, 'preds 0': array_pred[:,0], 'preds 1': array_pred[:,1], 'preds 2': array_pred[:,2]})
            # Keep only the part of each file name before the first dot.
            test_preds['imgs'] = test_preds['imgs'].apply(lambda x: x.split('.')[0])
        else:
            # Accumulate with `+=`. The former `=+` assigned the unary-plus of
            # the new scores, so only the last pass (already divided by
            # NUM_TTA) survived.
            test_preds['preds 0'] += array_pred[:,0]
            test_preds['preds 1'] += array_pred[:,1]
            test_preds['preds 2'] += array_pred[:,2]

        print(num_tta)
    return(test_preds)


In [None]:
# TTA-averaged class scores for the test and validation splits.
test_preds = def_tta(X_test,y_test)
# The validation split is loaded as X_valid / y_valid; X_val and y_val were
# never defined and raised NameError here.
valid_preds = def_tta(X_valid,y_valid)

## Save results

In [None]:
# Test predictions: the full table as CSV, then the three score columns
# flattened row by row into a plain text file.
df_test_preds = pd.DataFrame(test_preds)
df_test_preds.to_csv(r'.\sub_10_tta_test.csv', index=False)
flatten_test_preds = test_preds[['preds 0','preds 1','preds 2']].values.ravel()
np.savetxt(r'.\sub_10_tta_test_flatten.txt',flatten_test_preds)

# Test labels: the raw labels as CSV, then the binarized labels flattened in
# the same row-major order as the scores above.
df_test_labels = pd.DataFrame(y_test)
df_test_labels.to_csv(r'.\labels_test.csv', index=False)
flatten_y_preds = bin_y_test.ravel()
np.savetxt(r'.\labels_test_flatten.txt',flatten_y_preds)

In [None]:
# Validation predictions: the full table as CSV, then the three score columns
# flattened row by row into a plain text file.
df_valid_preds = pd.DataFrame(valid_preds)
df_valid_preds.to_csv(r'.\sub_10_tta_valid.csv', index=False)
flatten_valid_preds = valid_preds[['preds 0','preds 1','preds 2']].values.ravel()
np.savetxt(r'.\sub_10_tta_valid_flatten.txt',flatten_valid_preds)

# Validation labels: the raw labels as CSV, then the binarized labels
# flattened in the same row-major order as the scores above.
df_valid_labels = pd.DataFrame(y_valid)
df_valid_labels.to_csv(r'.\labels_valid.csv', index=False)
flatten_y_preds = np.array(bin_y_val).ravel()
np.savetxt(r'.\labels_valid_flatten.txt',flatten_y_preds)