#!/usr/bin/env python
# coding: utf-8

# ## ResNet50 model
#
# **Because the Kaggle GPU quota is only 30 hours per week and each training run takes 15+ hours, this notebook cannot be committed (that would exceed the quota); instead, the CSV files are downloaded and submitted manually.**


import numpy as np
import pandas as pd
import pydicom
import os
import matplotlib.pyplot as plt
import collections
from tqdm import tqdm_notebook as tqdm
from datetime import datetime

from math import ceil, floor, log
import cv2

import tensorflow as tf
import keras

import sys

from keras_applications.resnet import ResNet50

from sklearn.model_selection import ShuffleSplit


# ### Model Parameters Setup

# Paths
Path = '../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/'
train_img_path = Path + 'stage_2_train/'
test_img_path = Path + 'stage_2_test/'
sample_csv = Path + "stage_2_sample_submission.csv"
train_csv = Path + 'stage_2_train.csv'


def read_testset(filename):
    """Parse the sample submission into a (Image x Diagnosis) label frame."""
    df = pd.read_csv(filename)
    df["Image"] = df["ID"].str.slice(stop=12)
    df["Diagnosis"] = df["ID"].str.slice(start=13)

    df = df.loc[:, ["Label", "Diagnosis", "Image"]]
    df = df.set_index(['Image', 'Diagnosis']).unstack(level=-1)

    return df


def read_trainset(filename=Path + "stage_2_train.csv"):
    """Parse the training labels, dropping the known duplicate rows."""
    df = pd.read_csv(filename)
    df["Image"] = df["ID"].str.slice(stop=12)
    df["Diagnosis"] = df["ID"].str.slice(start=13)
    duplicates_to_remove = [56346, 56347, 56348, 56349,
                            56350, 56351, 1171830, 1171831,
                            1171832, 1171833, 1171834, 1171835,
                            3705312, 3705313, 3705314, 3705315,
                            3705316, 3705317, 3842478, 3842479,
                            3842480, 3842481, 3842482, 3842483]
    df = df.drop(index=duplicates_to_remove)
    df = df.reset_index(drop=True)
    df = df.loc[:, ["Label", "Diagnosis", "Image"]]
    df = df.set_index(['Image', 'Diagnosis']).unstack(level=-1)
    return df


test_df = read_testset(sample_csv)
train_df = read_trainset(train_csv)


# ### Data EDA and Cleaning

def correct_dcm(dcm):
    """Fix DICOMs whose 12-bit pixel data was stored with a wrapped-around intercept."""
    x = dcm.pixel_array + 1000
    px_mode = 4096
    x[x >= px_mode] = x[x >= px_mode] - px_mode
    dcm.PixelData = x.tobytes()
    dcm.RescaleIntercept = -1000


def window_image(dcm, window_center, window_width):
    if (dcm.BitsStored == 12) and (dcm.PixelRepresentation == 0) and (int(dcm.RescaleIntercept) > -100):
        correct_dcm(dcm)

    img = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
    img_min = window_center - window_width // 2
    img_max = window_center + window_width // 2
    img = np.clip(img, img_min, img_max)

    return img


def bsb_window(dcm):
    """Stack brain, subdural and soft-tissue windows into one 3-channel image, each scaled to [0, 1]."""
    brain_img = window_image(dcm, 40, 80)
    subdural_img = window_image(dcm, 80, 200)
    soft_img = window_image(dcm, 40, 380)

    brain_img = (brain_img - 0) / 80
    subdural_img = (subdural_img - (-20)) / 200
    soft_img = (soft_img - (-150)) / 380
    bsb_img = np.array([brain_img, subdural_img, soft_img]).transpose(1, 2, 0)

    return bsb_img
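
# Added illustration (not part of the original kernel): each window clips to
# [center - width/2, center + width/2] and rescales to [0, 1], which is exactly the
# per-channel normalisation bsb_window() applies. The HU value 60 below is an
# arbitrary example used only for this sanity check.
def _demo_window(hu, center, width):
    lo = center - width // 2
    return (np.clip(hu, lo, lo + width) - lo) / width

# brain (40/80), subdural (80/200), soft tissue (40/380)
print([_demo_window(60, c, w) for c, w in [(40, 80), (80, 200), (40, 380)]])
# expected roughly: [0.75, 0.4, 0.55]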

def window_with_correction(dcm, window_center, window_width):
    if (dcm.BitsStored == 12) and (dcm.PixelRepresentation == 0) and (int(dcm.RescaleIntercept) > -100):
        correct_dcm(dcm)
    img = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
    img_min = window_center - window_width // 2
    img_max = window_center + window_width // 2
    img = np.clip(img, img_min, img_max)
    return img


def window_without_correction(dcm, window_center, window_width):
    img = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
    img_min = window_center - window_width // 2
    img_max = window_center + window_width // 2
    img = np.clip(img, img_min, img_max)
    return img


def window_testing(img, window):
    brain_img = window(img, 40, 80)
    subdural_img = window(img, 80, 200)
    soft_img = window(img, 40, 380)

    brain_img = (brain_img - 0) / 80
    subdural_img = (subdural_img - (-20)) / 200
    soft_img = (soft_img - (-150)) / 380
    bsb_img = np.array([brain_img, subdural_img, soft_img]).transpose(1, 2, 0)

    return bsb_img


# Example of a "bad data point", i.e. one where
# (dcm.BitsStored == 12) and (dcm.PixelRepresentation == 0) and (int(dcm.RescaleIntercept) > -100) is True
example_img = train_img_path + train_df.index[102] + ".dcm"

dicom = pydicom.dcmread(example_img)

fig, ax = plt.subplots(1, 2)

ax[0].imshow(window_testing(dicom, window_without_correction), cmap=plt.cm.bone)
ax[0].set_title("original")
ax[1].imshow(window_testing(dicom, window_with_correction), cmap=plt.cm.bone)
ax[1].set_title("corrected")


# ### Load Data


def _read(path, desired_size):
    """Read a DICOM file and return a windowed, resized 3-channel image (used by DataGenerator)."""
    dcm = pydicom.dcmread(path)

    try:
        img = bsb_window(dcm)
    except Exception:
        img = np.zeros(desired_size)

    img = cv2.resize(img, desired_size[:2], interpolation=cv2.INTER_LINEAR)

    return img


# ### Data generators
#
# These inherit from keras.utils.Sequence and should therefore be safe for multiprocessing.


class DataGenerator(keras.utils.Sequence):

    def __init__(self, list_IDs, labels=None, batch_size=1, img_size=(512, 512, 1),
                 img_dir=train_img_path, *args, **kwargs):
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_dir = img_dir
        self.on_epoch_end()

    def __len__(self):
        return int(ceil(len(self.indices) / self.batch_size))

    def __getitem__(self, index):
        indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indices]

        if self.labels is not None:
            X, Y = self.__data_generation(list_IDs_temp)
            return X, Y
        else:
            X = self.__data_generation(list_IDs_temp)
            return X

    def on_epoch_end(self):
        if self.labels is not None:  # for the training phase we undersample and shuffle
            # keep probability of 0.35 for any=0 and 0.5 for any=1
            keep_prob = self.labels.iloc[:, 0].map({0: 0.35, 1: 0.5})
            keep = (keep_prob > np.random.rand(len(keep_prob)))
            self.indices = np.arange(len(self.list_IDs))[keep]
            np.random.shuffle(self.indices)
        else:
            self.indices = np.arange(len(self.list_IDs))

    def __data_generation(self, list_IDs_temp):
        X = np.empty((self.batch_size, *self.img_size))

        if self.labels is not None:  # training phase
            Y = np.empty((self.batch_size, 6), dtype=np.float32)

            for i, ID in enumerate(list_IDs_temp):
                X[i,] = _read(self.img_dir + ID + ".dcm", self.img_size)
                Y[i,] = self.labels.loc[ID].values

            return X, Y

        else:  # test phase
            for i, ID in enumerate(list_IDs_temp):
                X[i,] = _read(self.img_dir + ID + ".dcm", self.img_size)

            return X


# ### Metrics

from keras import backend as K


def weighted_log_loss(y_true, y_pred):
    """
    Class-weighted binary cross-entropy; can be used as the loss function in model.compile().
    """
    class_weights = np.array([2., 1., 1., 1., 1., 1.])

    eps = K.epsilon()

    y_pred = K.clip(y_pred, eps, 1.0 - eps)

    out = -(y_true * K.log(y_pred) * class_weights
            + (1.0 - y_true) * K.log(1.0 - y_pred) * class_weights)

    return K.mean(out, axis=-1)
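
# Added sanity check (not in the original kernel): reproduce the weighted loss in
# plain numpy for a single hypothetical sample. The "any" label gets weight 2, the
# five subtype labels weight 1, and the result is the mean over the 6 classes,
# which is what weighted_log_loss() returns per sample.
_y_true = np.array([1., 0., 0., 1., 0., 0.])
_y_pred = np.array([0.9, 0.1, 0.2, 0.7, 0.1, 0.05])
_w = np.array([2., 1., 1., 1., 1., 1.])
_per_class = -_w * (_y_true * np.log(_y_pred) + (1.0 - _y_true) * np.log(1.0 - _y_pred))
print(_per_class.mean())  # should agree with weighted_log_loss() on the same sample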

def _normalized_weighted_average(arr, weights=None):
    """
    A simple Keras implementation that mimics numpy.average(),
    specifically for this competition.
    """
    if weights is not None:
        scl = K.sum(weights)
        weights = K.expand_dims(weights, axis=1)
        return K.sum(K.dot(arr, weights), axis=1) / scl
    return K.mean(arr, axis=1)


def weighted_loss(y_true, y_pred):
    """
    Used as the metric in model.compile().
    ---------------------------------------------

    Similar to the custom loss function 'weighted_log_loss()' above,
    but with normalized weights, which should be very close
    to the official competition metric:
    https://www.kaggle.com/kambarakun/lb-probe-weights-n-of-positives-scoring
    and hence to sklearn.metrics.log_loss with sample weights.
    """
    class_weights = K.variable([2., 1., 1., 1., 1., 1.])

    eps = K.epsilon()

    y_pred = K.clip(y_pred, eps, 1.0 - eps)

    loss = -(y_true * K.log(y_pred)
             + (1.0 - y_true) * K.log(1.0 - y_pred))

    loss_samples = _normalized_weighted_average(loss, class_weights)

    return K.mean(loss_samples)


def weighted_log_loss_metric(trues, preds):
    """
    Numpy version used to calculate the log loss
    of the validation set in PredictionCheckpoint().
    """
    class_weights = [2., 1., 1., 1., 1., 1.]

    epsilon = 1e-7

    preds = np.clip(preds, epsilon, 1 - epsilon)
    loss = trues * np.log(preds) + (1 - trues) * np.log(1 - preds)
    loss_samples = np.average(loss, axis=1, weights=class_weights)

    return -loss_samples.mean()


# ### Model


class PredictionCheckpoint(keras.callbacks.Callback):
    """Collects test-set predictions at the end of every epoch so they can be ensembled later."""

    def __init__(self, test_df, valid_df,
                 test_images_dir=test_img_path,
                 valid_images_dir=train_img_path,
                 batch_size=32, input_size=(224, 224, 3)):
        self.test_df = test_df
        self.valid_df = valid_df
        self.test_images_dir = test_images_dir
        self.valid_images_dir = valid_images_dir
        self.batch_size = batch_size
        self.input_size = input_size

    def on_train_begin(self, logs={}):
        self.test_predictions = []
        self.valid_predictions = []  # kept for completeness; validation predictions are not collected here

    def on_epoch_end(self, epoch, logs={}):
        self.test_predictions.append(
            self.model.predict_generator(
                DataGenerator(self.test_df.index, None, self.batch_size, self.input_size, self.test_images_dir),
                verbose=2)[:len(self.test_df)])
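
# Added illustration (not from the original kernel): the LearningRateScheduler used in
# ResNet50_Model.fit_and_predict() below implements a step decay,
#     lr_epoch = learning_rate * decay_rate ** floor(epoch / decay_steps).
# With the settings used in the training cell further down
# (learning_rate=5e-4, decay_rate=0.8, decay_steps=1) the per-epoch rates are:
for _epoch in range(5):
    print(_epoch, 5e-4 * pow(0.8, floor(_epoch / 1)))
# 0: 5.0e-4, 1: 4.0e-4, 2: 3.2e-4, 3: 2.56e-4, 4: 2.048e-4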

class ResNet50_Model:

    def __init__(self, engine, input_dims, batch_size=5, num_epochs=4, learning_rate=1e-3,
                 decay_rate=1e-6, decay_steps=1, weights="imagenet", verbose=1):
        self.engine = engine
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.decay_rate = decay_rate
        self.decay_steps = decay_steps
        self.weights = weights
        self.verbose = verbose
        self._build()

    def _build(self):
        engine = self.engine(include_top=False, weights=self.weights, input_shape=self.input_dims,
                             backend=keras.backend, layers=keras.layers,
                             models=keras.models, utils=keras.utils)

        x = keras.layers.GlobalAveragePooling2D(name='avg_pool')(engine.output)
        out = keras.layers.Dense(6, activation="sigmoid", name='dense_output')(x)

        self.model = keras.models.Model(inputs=engine.input, outputs=out)

        self.model.compile(loss="binary_crossentropy", optimizer=keras.optimizers.Adam(), metrics=[weighted_loss])

    def fit_and_predict(self, train_df, valid_df, test_df):
        # callbacks
        pred_history = PredictionCheckpoint(test_df, valid_df, input_size=self.input_dims)
        scheduler = keras.callbacks.LearningRateScheduler(
            lambda epoch: self.learning_rate * pow(self.decay_rate, floor(epoch / self.decay_steps)))

        self.model.fit_generator(
            DataGenerator(
                train_df.index,
                train_df,
                self.batch_size,
                self.input_dims,
                train_img_path
            ),
            epochs=self.num_epochs,
            verbose=self.verbose,
            use_multiprocessing=True,
            workers=4,
            callbacks=[pred_history, scheduler]
        )

        return pred_history

    def save(self, path):
        self.model.save_weights(path)

    def load(self, path):
        self.model.load_weights(path)


# ### Train model and predict


# Train/Valid split
train_df_ss = ShuffleSplit(n_splits=10, test_size=0.1, random_state=42).split(train_df.index)

train_idx, valid_idx = next(train_df_ss)

# model
model = ResNet50_Model(engine=ResNet50, input_dims=(224, 224, 3), batch_size=32, learning_rate=5e-4,
                       num_epochs=5, decay_rate=0.8, decay_steps=1, weights="imagenet", verbose=1)

# predictions
history = model.fit_and_predict(train_df.iloc[train_idx], train_df.iloc[valid_idx], test_df)


# ### Ensemble and average all submission predictions

# Weighted average over the per-epoch test predictions: epoch 1 is ignored, later epochs count more.
test_df.iloc[:, :] = np.average(history.test_predictions, axis=0, weights=[0, 1, 2, 4, 6])
test_df = test_df.stack().reset_index()
test_df.insert(loc=0, column='ID', value=test_df['Image'].astype(str) + "_" + test_df['Diagnosis'])
test_df = test_df.drop(["Image", "Diagnosis"], axis=1)
test_df.to_csv('submission.csv', index=False)


from IPython.display import FileLink, FileLinks
FileLink('submission.csv')
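
# Added optional check (not in the original kernel): confirm the written submission
# has one "ID_<image>_<subtype>" row per test image and subtype, and that the
# predicted probabilities are valid.
sub = pd.read_csv('submission.csv')
print(sub.shape)   # expected: six rows per test image, two columns (ID, Label)
print(sub.head(12))
assert sub['Label'].between(0, 1).all()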