--- a +++ b/Notebook/Week 3/basic model.ipynb @@ -0,0 +1,809 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pickle\n", + "import random\n", + "import glob\n", + "import datetime\n", + "import pandas as pd\n", + "import numpy as np\n", + "import cv2\n", + "import pydicom\n", + "from tqdm import tqdm\n", + "from joblib import delayed, Parallel\n", + "import zipfile\n", + "from pydicom.filebase import DicomBytesIO\n", + "import sys\n", + "from PIL import Image\n", + "import cv2\n", + "\n", + "import keras\n", + "from keras.models import model_from_json\n", + "import tensorflow as tf\n", + "from keras.models import Sequential, Model\n", + "from keras.layers import Dense, Conv2D, Flatten, MaxPooling2D, GlobalAveragePooling2D\n", + "from keras.applications.inception_v3 import InceptionV3\n", + "\n", + "# importing pyplot and image from matplotlib \n", + "import matplotlib.pyplot as plt \n", + "import matplotlib.image as mpimg \n", + "\n", + "\n", + "from keras.preprocessing import image\n", + "import albumentations as A" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "base_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/'" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th>Diagnosis</th>\n", + " <th>any</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " </tr>\n", + " <tr>\n", + " <th>ImageID</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>ID_000012eaf</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_000039fa0</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_00005679d</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_00008ce3c</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_0000950d7</th>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + "Diagnosis any epidural intraparenchymal intraventricular subarachnoid \\\n", + "ImageID \n", + "ID_000012eaf 0 0 0 0 0 \n", + "ID_000039fa0 0 0 0 0 0 \n", + "ID_00005679d 0 0 0 0 0 \n", + "ID_00008ce3c 0 0 0 0 0 \n", + "ID_0000950d7 0 0 0 0 0 \n", + "\n", + "Diagnosis subdural \n", + "ImageID \n", + "ID_000012eaf 0 \n", + "ID_000039fa0 0 \n", + "ID_00005679d 0 \n", + "ID_00008ce3c 0 \n", + "ID_0000950d7 0 " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df = pd.read_csv(f'{base_url}/stage_2_train.csv').drop_duplicates()\n", + "train_df['ImageID'] = train_df['ID'].str.slice(stop=12)\n", + "train_df['Diagnosis'] = train_df['ID'].str.slice(start=13)\n", + "train_labels = train_df.pivot(index=\"ImageID\", columns=\"Diagnosis\", values=\"Label\")\n", + "train_labels.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "train_metadata = pd.read_parquet(f'{base_url}/train_metadata.parquet.gzip')\n", + "test_metadata = pd.read_parquet(f'{base_url}/test_metadata.parquet.gzip')\n", + "\n", + "train_metadata[\"Dataset\"] = \"train\"\n", + "test_metadata[\"Dataset\"] = \"test\"\n", + "\n", + "train_metadata = train_metadata.join(train_labels)\n", + "\n", + "metadata = pd.concat([train_metadata, test_metadata], sort=True)\n", + "metadata.sort_values(by=\"ImagePositionPatient_2\", inplace=True, ascending=False)\n", + "metadata.drop(['ID_6431af929'],inplace = True)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>BitsAllocated</th>\n", + " <th>BitsStored</th>\n", + " <th>Columns</th>\n", + " <th>Dataset</th>\n", + " <th>HighBit</th>\n", + " <th>ImageOrientationPatient_0</th>\n", + " <th>ImageOrientationPatient_1</th>\n", + " <th>ImageOrientationPatient_2</th>\n", + " <th>ImageOrientationPatient_3</th>\n", + " <th>ImageOrientationPatient_4</th>\n", + " <th>...</th>\n", + " <th>StudyID</th>\n", + " <th>StudyInstanceUID</th>\n", + " <th>WindowCenter</th>\n", + " <th>WindowWidth</th>\n", + " <th>any</th>\n", + " <th>epidural</th>\n", + " <th>intraparenchymal</th>\n", + " <th>intraventricular</th>\n", + " <th>subarachnoid</th>\n", + " <th>subdural</th>\n", + " </tr>\n", + " <tr>\n", + " <th>Image</th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " <th></th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>ID_24250ffbc</th>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>train</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.920505</td>\n", + " <td>...</td>\n", + " <td></td>\n", + " <td>ID_6222a3935b</td>\n", + " <td>40.0</td>\n", + " <td>80.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_6e8c8d650</th>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>train</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.920505</td>\n", + " <td>...</td>\n", + " <td></td>\n", + " <td>ID_6222a3935b</td>\n", + " <td>40.0</td>\n", + " <td>80.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_ac042708d</th>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>train</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.920505</td>\n", + " <td>...</td>\n", + " <td></td>\n", + " <td>ID_6222a3935b</td>\n", + " <td>40.0</td>\n", + " <td>80.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_d1e2a17a9</th>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>train</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.927184</td>\n", + " <td>...</td>\n", + " <td></td>\n", + " <td>ID_a5fb903898</td>\n", + " <td>40.0</td>\n", + " <td>80.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>ID_e1a1b45a5</th>\n", + " <td>16</td>\n", + " <td>12</td>\n", + " <td>512</td>\n", + " <td>train</td>\n", + " <td>11</td>\n", + " <td>1.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.920505</td>\n", + " <td>...</td>\n", + " <td></td>\n", + " <td>ID_6222a3935b</td>\n", + " <td>40.0</td>\n", + " <td>80.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "<p>5 rows × 36 columns</p>\n", + "</div>" + ], + "text/plain": [ + " BitsAllocated BitsStored Columns Dataset HighBit \\\n", + "Image \n", + "ID_24250ffbc 16 12 512 train 11 \n", + "ID_6e8c8d650 16 12 512 train 11 \n", + "ID_ac042708d 16 12 512 train 11 \n", + "ID_d1e2a17a9 16 12 512 train 11 \n", + "ID_e1a1b45a5 16 12 512 train 11 \n", + "\n", + " ImageOrientationPatient_0 ImageOrientationPatient_1 \\\n", + "Image \n", + "ID_24250ffbc 1.0 0.0 \n", + "ID_6e8c8d650 1.0 0.0 \n", + "ID_ac042708d 1.0 0.0 \n", + "ID_d1e2a17a9 1.0 0.0 \n", + "ID_e1a1b45a5 1.0 0.0 \n", + "\n", + " ImageOrientationPatient_2 ImageOrientationPatient_3 \\\n", + "Image \n", + "ID_24250ffbc 0.0 0.0 \n", + "ID_6e8c8d650 0.0 0.0 \n", + "ID_ac042708d 0.0 0.0 \n", + "ID_d1e2a17a9 0.0 0.0 \n", + "ID_e1a1b45a5 0.0 0.0 \n", + "\n", + " ImageOrientationPatient_4 ... StudyID StudyInstanceUID \\\n", + "Image ... \n", + "ID_24250ffbc 0.920505 ... ID_6222a3935b \n", + "ID_6e8c8d650 0.920505 ... ID_6222a3935b \n", + "ID_ac042708d 0.920505 ... ID_6222a3935b \n", + "ID_d1e2a17a9 0.927184 ... ID_a5fb903898 \n", + "ID_e1a1b45a5 0.920505 ... ID_6222a3935b \n", + "\n", + " WindowCenter WindowWidth any epidural intraparenchymal \\\n", + "Image \n", + "ID_24250ffbc 40.0 80.0 0.0 0.0 0.0 \n", + "ID_6e8c8d650 40.0 80.0 0.0 0.0 0.0 \n", + "ID_ac042708d 40.0 80.0 0.0 0.0 0.0 \n", + "ID_d1e2a17a9 40.0 80.0 0.0 0.0 0.0 \n", + "ID_e1a1b45a5 40.0 80.0 0.0 0.0 0.0 \n", + "\n", + " intraventricular subarachnoid subdural \n", + "Image \n", + "ID_24250ffbc 0.0 0.0 0.0 \n", + "ID_6e8c8d650 0.0 0.0 0.0 \n", + "ID_ac042708d 0.0 0.0 0.0 \n", + "ID_d1e2a17a9 0.0 0.0 0.0 \n", + "ID_e1a1b45a5 0.0 0.0 0.0 \n", + "\n", + "[5 rows x 36 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "metadata.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "test_df = metadata[metadata['Dataset'] == 'test'].iloc[:,:-6].drop(['Dataset'], axis= 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "train_df = metadata[metadata['Dataset'] == 'train'].drop(['Dataset'], axis= 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(752802, 35)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(752802, 6)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_y = train_df[['any','epidural','intraparenchymal','intraventricular', 'subarachnoid','subdural']]\n", + "train_y.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Generator" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "from skimage.io import imread\n", + "def get_input(path):\n", + " \n", + " img = imread( path )\n", + " \n", + " return( img )" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from tensorflow.keras.applications.vgg19 import preprocess_input\n", + "def get_output( path, label_file = None ):\n", + " \n", + " img_id = path.split('/')[-1].split('.')[0]\n", + " labels = label_file.loc[img_id].values\n", + " \n", + " return(labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess_input( image ):\n", + " image_size = (224,224)\n", + " transform = A.Compose([\n", + " A.Resize(*image_size),\n", + " A.HorizontalFlip(),\n", + " A.OneOf([\n", + " A.ElasticTransform(alpha=120, sigma=120 * 0.05, alpha_affine=120 * 0.03,p=0.1),\n", + " A.GridDistortion(p=0.2),\n", + " A.OpticalDistortion(distort_limit=2, shift_limit=0.5,p=0.3),\n", + " ], p=0.3),\n", + " A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=10,p=0.1),\n", + " ])\n", + " random.seed(42) \n", + " augmented_image = cv2.resize(image, (224, 224), \n", + " interpolation = cv2.INTER_NEAREST) \n", + " #augmented_image = transform(image=image)['image']\n", + " #image = np.expand_dims(augmented_image, axis=0)\n", + " #image = preprocess_input(image)\n", + " return( augmented_image )" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [], + "source": [ + "def image_generator(files, label_file, batch_size = 64):\n", + " train_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/png/train/adjacent-brain-cropped/'\n", + " while True:\n", + " # Select files (paths/indices) for the batch\n", + " for index in range(len(files)):\n", + " batch_paths = files[index*batch_size:(index+1)*batch_size]\n", + " batch_input = []\n", + " batch_output = [] \n", + "\n", + " # Read in each input, perform preprocessing and get labels\n", + " for input_path in batch_paths:\n", + " input = get_input( train_url +input_path + '.png')\n", + " output = get_output(train_url + input_path + '.png',label_file=label_file )\n", + "\n", + " input = preprocess_input(image=input)\n", + " batch_input += [ input ]\n", + " batch_output += [ output ]\n", + " # Return a tuple of (input, output) to feed the network\n", + " batch_x = np.array( batch_input )\n", + " batch_y = np.array( batch_output )\n", + "\n", + " yield( batch_x, batch_y )" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [], + "source": [ + "train_url = '/home/ubuntu/kaggle/rsna-intracranial-hemorrhage-detection/png/train/adjacent-brain-cropped/'\n" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "train_generator = image_generator(train_df.index, train_y, batch_size = 32)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "l = next(train_generator)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(224, 224, 3)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "l[0][0].shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 3/3\n", + "5881/5881 [==============================] - 1368s 233ms/step - loss: 0.2791 - accuracy: 0.9298 - true_positives_9: 641.2561 - false_positives_9: 2328.5566 - true_negatives_9: 525110.8125 - false_negatives_9: 36590.9922 - auc_9: 0.6685\n" + ] + }, + { + "data": { + "text/plain": [ + "<keras.callbacks.callbacks.History at 0x7f0d874cfdd8>" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# create the base pre-trained model\n", + "base_model = InceptionV3(weights='imagenet', include_top=False, input_shape=(224,224,3))\n", + "for layer in base_model.layers[:-2]:\n", + " layer.trainable = False\n", + "\n", + "x = base_model.output\n", + "x = GlobalAveragePooling2D()(x)\n", + "#x = Dense(1024, activation='relu')(x)\n", + "#x = Dense(256, activation='relu')(x)\n", + "predictions = Dense(6, activation='softmax')(x)\n", + "\n", + "adam = keras.optimizers.Adam(learning_rate=0.00001,\n", + " beta_1=0.9,\n", + " beta_2=0.999,\n", + " amsgrad=False)\n", + "\n", + "# this is the model we will train\n", + "model = Model(inputs=base_model.input, outputs=predictions)\n", + "model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy',tf.keras.metrics.TruePositives(),tf.keras.metrics.FalsePositives(), tf.keras.metrics.TrueNegatives(),tf.keras.metrics.FalseNegatives(), tf.keras.metrics.AUC()])\n", + "\n", + "BATCH_SIZE = 128\n", + "NUM_TRAIN_IMAGES = train_df.shape[0]\n", + "# Train model on dataset\n", + "model.fit_generator(train_generator, initial_epoch =2,epochs = 3,steps_per_epoch= NUM_TRAIN_IMAGES // BATCH_SIZE, verbose = 1)\n", + "\n", + "\n", + "# In[ ]:\n", + "\n", + "\n", + "\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model weights and architecture saved.\n" + ] + } + ], + "source": [ + "\n", + "# Save Weights and architecture\n", + "model.save_weights('model_weights-BC_3.h5')\n", + "\n", + "\n", + "# Save the model architecture\n", + "with open('model_architecture-BC_3.json', 'w') as f:\n", + " f.write(model.to_json())\n", + "\n", + "print(\"Model weights and architecture saved.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "<generator object image_generator at 0x7efefb7da678>" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "train_generator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Environment (conda_tensorflow2_p36)", + "language": "python", + "name": "conda_tensorflow2_p36" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}