#!/usr/bin/env python
# coding: utf-8
# ## ResNet50 model
#
# **Because the Kaggle GPU quota is only 30 hours per week and each training run takes 15+ hours, this notebook cannot be committed (committing would exceed the quota); instead, the CSV files are downloaded and submitted manually.**
#
#
import numpy as np
import pandas as pd
import pydicom
import os
import matplotlib.pyplot as plt
import collections
from tqdm import tqdm_notebook as tqdm
from datetime import datetime
from math import ceil, floor, log
import cv2
import tensorflow as tf
import keras
import sys
from keras_applications.resnet import ResNet50
from sklearn.model_selection import ShuffleSplit
# ### Model Parameters Setup
# Paths
Path = '../input/rsna-intracranial-hemorrhage-detection/rsna-intracranial-hemorrhage-detection/'
train_img_path = Path + 'stage_2_train/'
test_img_path = Path + 'stage_2_test/'
sample_csv = Path + "stage_2_sample_submission.csv"
train_csv = Path + 'stage_2_train.csv'
def read_testset(filename):
    """Read the sample submission and pivot it to one row per image."""
    df = pd.read_csv(filename)
    # Each ID has the form "ID_xxxxxxxxx_diagnosis": the first 12 characters
    # identify the image; the part after the second underscore is the subtype.
    df["Image"] = df["ID"].str.slice(stop=12)
    df["Diagnosis"] = df["ID"].str.slice(start=13)
    df = df.loc[:, ["Label", "Diagnosis", "Image"]]
    df = df.set_index(['Image', 'Diagnosis']).unstack(level=-1)
    return df
def read_trainset(filename = Path + "stage_2_train.csv"):
    df = pd.read_csv(filename)
    df["Image"] = df["ID"].str.slice(stop=12)
    df["Diagnosis"] = df["ID"].str.slice(start=13)
    # Duplicate rows to drop from the stage 2 train CSV.
    duplicates_to_remove = [56346, 56347, 56348, 56349,
                            56350, 56351, 1171830, 1171831,
                            1171832, 1171833, 1171834, 1171835,
                            3705312, 3705313, 3705314, 3705315,
                            3705316, 3705317, 3842478, 3842479,
                            3842480, 3842481, 3842482, 3842483]
    df = df.drop(index = duplicates_to_remove)
    df = df.reset_index(drop = True)
    df = df.loc[:, ["Label", "Diagnosis", "Image"]]
    df = df.set_index(['Image', 'Diagnosis']).unstack(level=-1)
    return df
test_df = read_testset(sample_csv)
train_df = read_trainset(train_csv)
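# The raw CSVs are long-format: one row per (image, subtype), with IDs of the
# form "ID_xxxxxxxxx_subtype" (12-character image id, subtype after the second
# underscore). A quick illustrative check of the pivoted result:
print(train_df.shape)             # one row per image, 6 label columns
print(train_df.columns.tolist())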
# ### Data EDA and Cleaning
def correct_dcm(dcm):
    # Some DICOMs store signed 12-bit pixel data as unsigned with a wrong
    # RescaleIntercept; shift by +1000, wrap at 4096, and reset the intercept.
    x = dcm.pixel_array + 1000
    px_mode = 4096
    x[x >= px_mode] = x[x >= px_mode] - px_mode
    dcm.PixelData = x.tobytes()
    dcm.RescaleIntercept = -1000
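# Worked example of the correction above: a stored 12-bit value of 4000 really
# encodes the signed value 4000 - 4096 = -96. The fix reproduces this:
# 4000 + 1000 = 5000 -> wraps to 5000 - 4096 = 904 -> 904 + (-1000) = -96 HU.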
def window_image(dcm, window_center, window_width):
    # Note: the intercept check is > -100 (not -1000), matching
    # window_with_correction() below and the "bad data point" example.
    if (dcm.BitsStored == 12) and (dcm.PixelRepresentation == 0) and (int(dcm.RescaleIntercept) > -100):
        correct_dcm(dcm)
    img = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
    img_min = window_center - window_width // 2
    img_max = window_center + window_width // 2
    img = np.clip(img, img_min, img_max)
    return img
def bsb_window(dcm):
    # Brain / subdural / soft-tissue windows, each rescaled to [0, 1]
    # and stacked as the three channels of one image.
    brain_img = window_image(dcm, 40, 80)
    subdural_img = window_image(dcm, 80, 200)
    soft_img = window_image(dcm, 40, 380)
    brain_img = (brain_img - 0) / 80
    subdural_img = (subdural_img - (-20)) / 200
    soft_img = (soft_img - (-150)) / 380
    bsb_img = np.array([brain_img, subdural_img, soft_img]).transpose(1, 2, 0)
    return bsb_img
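# Quick numeric check (illustrative only): each window's [min, max] range
# should map exactly onto [0, 1] after the shift-and-scale above.
for _wc, _ww, _lo in [(40, 80, 0), (80, 200, -20), (40, 380, -150)]:
    _hi = _wc + _ww // 2
    assert _lo == _wc - _ww // 2
    print((_lo - _lo) / _ww, (_hi - _lo) / _ww)  # expect 0.0 and 1.0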
def window_with_correction(dcm, window_center, window_width):
    if (dcm.BitsStored == 12) and (dcm.PixelRepresentation == 0) and (int(dcm.RescaleIntercept) > -100):
        correct_dcm(dcm)
    img = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
    img_min = window_center - window_width // 2
    img_max = window_center + window_width // 2
    img = np.clip(img, img_min, img_max)
    return img
def window_without_correction(dcm, window_center, window_width):
    img = dcm.pixel_array * dcm.RescaleSlope + dcm.RescaleIntercept
    img_min = window_center - window_width // 2
    img_max = window_center + window_width // 2
    img = np.clip(img, img_min, img_max)
    return img
def window_testing(img, window):
    brain_img = window(img, 40, 80)
    subdural_img = window(img, 80, 200)
    soft_img = window(img, 40, 380)
    brain_img = (brain_img - 0) / 80
    subdural_img = (subdural_img - (-20)) / 200
    soft_img = (soft_img - (-150)) / 380
    bsb_img = np.array([brain_img, subdural_img, soft_img]).transpose(1, 2, 0)
    return bsb_img
# example of a "bad data point" (i.e. (dcm.BitsStored == 12) and (dcm.PixelRepresentation == 0) and (int(dcm.RescaleIntercept) > -100) == True)
example_img = train_img_path + train_df.index[102] + ".dcm"
dicom = pydicom.dcmread(example_img)
fig, ax = plt.subplots(1, 2)
ax[0].imshow(window_testing(dicom, window_without_correction), cmap=plt.cm.bone);
ax[0].set_title("original")
ax[1].imshow(window_testing(dicom, window_with_correction), cmap=plt.cm.bone);
ax[1].set_title("corrected");
# ### Load Data
def _read(path, desired_size):
    """Read a DICOM file, window it, and resize; used in DataGenerator."""
    dcm = pydicom.dcmread(path)
    try:
        img = bsb_window(dcm)
    except Exception:
        # Fall back to a blank image if the file cannot be windowed.
        img = np.zeros(desired_size)
    img = cv2.resize(img, desired_size[:2], interpolation=cv2.INTER_LINEAR)
    return img
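# Smoke test (illustrative; assumes the competition data is mounted at Path):
_sample = _read(train_img_path + train_df.index[0] + ".dcm", (224, 224, 3))
print(_sample.shape)  # expected: (224, 224, 3)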
# ### Data generators
#
# Inherits from keras.utils.Sequence, so it should be safe for multiprocessing.
#
class DataGenerator(keras.utils.Sequence):
    def __init__(self, list_IDs, labels=None, batch_size=1, img_size=(512, 512, 1),
                 img_dir=train_img_path, *args, **kwargs):
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_dir = img_dir
        self.on_epoch_end()
    def __len__(self):
        return int(ceil(len(self.indices) / self.batch_size))
    def __getitem__(self, index):
        indices = self.indices[index * self.batch_size:(index + 1) * self.batch_size]
        list_IDs_temp = [self.list_IDs[k] for k in indices]
        if self.labels is not None:
            X, Y = self.__data_generation(list_IDs_temp)
            return X, Y
        else:
            X = self.__data_generation(list_IDs_temp)
            return X
    def on_epoch_end(self):
        if self.labels is not None:  # for the training phase we undersample and shuffle
            # keep probability depends on the "any" label: 0 -> 0.35, 1 -> 0.5
            keep_prob = self.labels.iloc[:, 0].map({0: 0.35, 1: 0.5})
            keep = (keep_prob > np.random.rand(len(keep_prob)))
            self.indices = np.arange(len(self.list_IDs))[keep]
            np.random.shuffle(self.indices)
        else:
            self.indices = np.arange(len(self.list_IDs))
    def __data_generation(self, list_IDs_temp):
        # Size arrays by the actual batch so the final, possibly shorter,
        # batch does not contain uninitialized rows.
        X = np.empty((len(list_IDs_temp), *self.img_size))
        if self.labels is not None:  # training phase
            Y = np.empty((len(list_IDs_temp), 6), dtype=np.float32)
            for i, ID in enumerate(list_IDs_temp):
                X[i,] = _read(self.img_dir + ID + ".dcm", self.img_size)
                Y[i,] = self.labels.loc[ID].values
            return X, Y
        else:  # test phase
            for i, ID in enumerate(list_IDs_temp):
                X[i,] = _read(self.img_dir + ID + ".dcm", self.img_size)
            return X
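# Illustrative smoke test for the generator (not part of training): one batch
# should yield X of shape (batch_size, 224, 224, 3) and Y of shape (batch_size, 6).
_gen = DataGenerator(train_df.index, train_df, batch_size=4, img_size=(224, 224, 3),
                     img_dir=train_img_path)
_X, _Y = _gen[0]
print(_X.shape, _Y.shape)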
# ### Metrics
from keras import backend as K
def weighted_log_loss(y_true, y_pred):
    """
    Can be used as the loss function in model.compile()
    ---------------------------------------------------
    """
    class_weights = np.array([2., 1., 1., 1., 1., 1.])
    eps = K.epsilon()
    y_pred = K.clip(y_pred, eps, 1.0 - eps)
    out = -(y_true * K.log(y_pred) * class_weights
            + (1.0 - y_true) * K.log(1.0 - y_pred) * class_weights)
    return K.mean(out, axis=-1)
def _normalized_weighted_average(arr, weights=None):
    """
    A simple Keras implementation that mimics
    numpy.average(), specifically for this competition
    """
    if weights is not None:
        scl = K.sum(weights)
        weights = K.expand_dims(weights, axis=1)
        return K.sum(K.dot(arr, weights), axis=1) / scl
    return K.mean(arr, axis=1)
def weighted_loss(y_true, y_pred):
    """
    Will be used as the metric in model.compile()
    ---------------------------------------------
    Similar to the custom loss function 'weighted_log_loss()' above
    but with normalized weights, which should be very similar
    to the official competition metric:
    https://www.kaggle.com/kambarakun/lb-probe-weights-n-of-positives-scoring
    and hence:
    sklearn.metrics.log_loss with sample weights
    """
    class_weights = K.variable([2., 1., 1., 1., 1., 1.])
    eps = K.epsilon()
    y_pred = K.clip(y_pred, eps, 1.0 - eps)
    loss = -(y_true * K.log(y_pred)
             + (1.0 - y_true) * K.log(1.0 - y_pred))
    loss_samples = _normalized_weighted_average(loss, class_weights)
    return K.mean(loss_samples)
def weighted_log_loss_metric(trues, preds):
    """
    Will be used to calculate the log loss
    of the validation set in PredictionCheckpoint()
    ------------------------------------------
    """
    class_weights = [2., 1., 1., 1., 1., 1.]
    epsilon = 1e-7
    preds = np.clip(preds, epsilon, 1 - epsilon)
    loss = trues * np.log(preds) + (1 - trues) * np.log(1 - preds)
    loss_samples = np.average(loss, axis=1, weights=class_weights)
    return -loss_samples.mean()
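# Sanity check (illustrative): on random data the numpy metric above should
# agree with sklearn.metrics.log_loss over the flattened arrays, using the
# class weights tiled as per-element sample weights.
from sklearn.metrics import log_loss
_rng = np.random.RandomState(42)
_trues = _rng.randint(0, 2, size=(16, 6)).astype(np.float64)
_preds = _rng.uniform(0.01, 0.99, size=(16, 6))
print(weighted_log_loss_metric(_trues, _preds))
print(log_loss(_trues.ravel(), _preds.ravel(),
               sample_weight=np.tile([2., 1., 1., 1., 1., 1.], 16)))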
# ### Model
#
#
class PredictionCheckpoint(keras.callbacks.Callback):
    def __init__(self, test_df, valid_df,
                 test_images_dir=test_img_path,
                 valid_images_dir=train_img_path,
                 batch_size=32, input_size=(224, 224, 3)):
        self.test_df = test_df
        self.valid_df = valid_df
        self.test_images_dir = test_images_dir
        self.valid_images_dir = valid_images_dir
        self.batch_size = batch_size
        self.input_size = input_size
    def on_train_begin(self, logs={}):
        self.test_predictions = []
        self.valid_predictions = []
    def on_epoch_end(self, epoch, logs={}):
        # Predict the test set after every epoch; slicing to len(test_df)
        # guards against any padding in the final generator batch.
        self.test_predictions.append(
            self.model.predict_generator(
                DataGenerator(self.test_df.index, None, self.batch_size, self.input_size, self.test_images_dir), verbose=2)[:len(self.test_df)])
class ResNet50_Model:
    def __init__(self, engine, input_dims, batch_size=5, num_epochs=4, learning_rate=1e-3,
                 decay_rate=1e-6, decay_steps=1, weights="imagenet", verbose=1):
        self.engine = engine
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.learning_rate = learning_rate
        self.decay_rate = decay_rate
        self.decay_steps = decay_steps
        self.weights = weights
        self.verbose = verbose
        self._build()
    def _build(self):
        engine = self.engine(include_top=False, weights=self.weights, input_shape=self.input_dims,
                             backend=keras.backend, layers=keras.layers,
                             models=keras.models, utils=keras.utils)
        x = keras.layers.GlobalAveragePooling2D(name='avg_pool')(engine.output)
        out = keras.layers.Dense(6, activation="sigmoid", name='dense_output')(x)
        self.model = keras.models.Model(inputs=engine.input, outputs=out)
        self.model.compile(loss="binary_crossentropy", optimizer=keras.optimizers.Adam(), metrics=[weighted_loss])
    def fit_and_predict(self, train_df, valid_df, test_df):
        # callbacks
        pred_history = PredictionCheckpoint(test_df, valid_df, input_size=self.input_dims)
        scheduler = keras.callbacks.LearningRateScheduler(lambda epoch: self.learning_rate * pow(self.decay_rate, floor(epoch / self.decay_steps)))
        self.model.fit_generator(
            DataGenerator(
                train_df.index,
                train_df,
                self.batch_size,
                self.input_dims,
                train_img_path
            ),
            epochs=self.num_epochs,
            verbose=self.verbose,
            use_multiprocessing=True,
            workers=4,
            callbacks=[pred_history, scheduler]
        )
        return pred_history
    def save(self, path):
        self.model.save_weights(path)
    def load(self, path):
        self.model.load_weights(path)
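# For reference, with the settings used below (learning_rate=5e-4,
# decay_rate=0.8, decay_steps=1), the scheduler in fit_and_predict() gives
# lr = 5e-4 * 0.8**epoch:
for _epoch in range(5):
    print(_epoch, 5e-4 * pow(0.8, floor(_epoch / 1)))
# 0: 5.0e-04, 1: 4.0e-04, 2: 3.2e-04, 3: 2.56e-04, 4: 2.048e-04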
# ### Train model and predict
#
#
#
# Train/Valid/Test Split
train_df_ss = ShuffleSplit(n_splits=10, test_size=0.1, random_state=42).split(train_df.index)
train_idx, valid_idx = next(train_df_ss)
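# Sanity check: test_size=0.1 gives roughly a 90/10 train/validation split.
print(len(train_idx), len(valid_idx))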
# model
model = ResNet50_Model(engine=ResNet50, input_dims=(224, 224, 3), batch_size=32, learning_rate=5e-4,
num_epochs=5, decay_rate=0.8, decay_steps=1, weights="imagenet", verbose=1)
#predictions
history = model.fit_and_predict(train_df.iloc[train_idx], train_df.iloc[valid_idx], test_df)
# ### Ensemble and average all submission_predictions.
test_df.iloc[:, :] = np.average(history.test_predictions, axis=0, weights=[0, 1, 2, 4, 6])  # weighted average over the 5 per-epoch predictions: the first epoch is dropped (weight 0), later epochs count more
test_df = test_df.stack().reset_index()
test_df.insert(loc=0, column='ID', value=test_df['Image'].astype(str) + "_" + test_df['Diagnosis'])
test_df = test_df.drop(["Image", "Diagnosis"], axis=1)
test_df.to_csv('submission.csv', index=False)
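# Quick preview of the submission format (illustrative):
print(pd.read_csv('submission.csv').head())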
from IPython.display import FileLink, FileLinks
FileLink('submission.csv')