# %% [markdown]
# # Lung Cancer Prediction System
#
# Pipeline implemented by this notebook:
# 1. load the pre-segmented CT slices and derive a label from each file path,
# 2. down-scale them and push them through an (untrained) convolutional stack
#    to obtain fixed-length feature vectors,
# 3. fit a random forest on those features,
# 4. expose a Tkinter window that segments a user-selected slice, shows every
#    intermediate image and displays the forest's prediction.

# %% Imports
import warnings
warnings.filterwarnings('ignore')

import math

import glob
# NOTE: the wildcard import must stay *before* the PIL import below, because
# PIL's `Image` intentionally replaces the `Image` name exported by tkinter.
from tkinter import *
from tkinter import filedialog
from PIL import ImageTk, Image
import os
import imghdr
import cv2 as cv

import pandas as pd
import numpy as np
from numpy import sqrt

from skimage.transform import pyramid_reduce, resize

import skimage.filters as filters

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.utils import shuffle

#models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

import tensorflow as tf
from tensorflow.keras import datasets, layers, models

from keras.layers import Conv2D, MaxPooling2D, Dense, Dropout, Input, Flatten, Activation

#preprocessing part
from skimage.segmentation import clear_border
from skimage.measure import label,regionprops, perimeter
from skimage.morphology import ball, disk, binary_erosion, remove_small_objects, reconstruction, binary_closing, binary_opening
from skimage.filters import roberts, sobel
from scipy import ndimage as ndi

# %% Configuration
# One seed for every stochastic step (splits, CNN weight initialisation,
# random forest) so that "Restart & Run All" reproduces the same numbers.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

DATASET_GLOB = 'dataset/after_preprocessing/train_test/*/*/*.*'
SEGMENT_SIZE = 528      # side length of the segmented images
IMG_SIZE = 128          # side length fed to the CNN feature extractor
CHANNEL = 1             # grayscale
TEST_FRACTION = 0.2167  # share of the data held out by train_test_split
HOLDOUT_COUNT = 300     # size of the fixed leading slice kept as a test set

# Files written by get_segmented_lungs(show_on_window=True) and read back by
# the GUI; the numeric prefixes keep them in pipeline order.
RESULT_DIR = 'result/'
RESULT_EXT = '.jpg'
RESULT_TITLES = ['0_original_image', '1_cropped_image', '2_binary_image', '3_remove_blobs', '4_closure',
                 '5_roi', '6_erosion', '7_closure', '8_fill_hole', '9_result']

# %% Load the segmented images and their labels
# Folder names on disk use the misspelling "Bengin"; `classes` holds the
# corrected spelling that is shown to the user.
classes_old = ["Bengin","Malignant","Normal"]
classes = ["Benign","Malignant","Normal"]


def _label_from_path(img_path):
    """Return the class name encoded in `img_path`.

    The path is searched for the on-disk folder names in `classes_old`.
    NOTE(review): any path matching neither "Bengin" nor "Malignant" is
    labelled "Normal", exactly as the original code did - confirm that the
    dataset tree cannot contain anything else.
    """
    if classes_old[0] in img_path:
        return classes[0]
    if classes_old[1] in img_path:
        return classes[1]
    return classes[2]


img_list = sorted(glob.glob(DATASET_GLOB))
print('Total: ', len(img_list))
if not img_list:
    raise FileNotFoundError('No images found for pattern: ' + DATASET_GLOB)

# (number of images, height, width) - single channel, 8-bit
X_segmented = np.empty((len(img_list), SEGMENT_SIZE, SEGMENT_SIZE), dtype=np.uint8)

y = []
for i, img_path in enumerate(img_list):
    img = cv.imread(img_path, cv.IMREAD_GRAYSCALE)
    if img is None:
        # cv.imread signals failure by returning None instead of raising.
        raise ValueError('Could not read image: ' + img_path)
    X_segmented[i] = cv.resize(img, (SEGMENT_SIZE, SEGMENT_SIZE))
    y.append(_label_from_path(img_path))

y = pd.Series(y)

# %% Down-scale for the models
X_dl_segmented_resize = np.empty((len(img_list), IMG_SIZE, IMG_SIZE), dtype=np.uint8)
for i, img in enumerate(X_segmented):
    X_dl_segmented_resize[i] = cv.resize(img, (IMG_SIZE, IMG_SIZE))

# 4-D (n, h, w, channel) layout for the convolutional network ...
X_dl_segmented_resize = X_dl_segmented_resize.reshape(-1, IMG_SIZE, IMG_SIZE, CHANNEL)
# ... and a flat (n, h*w) layout for classical models.
X_ml_segmented_resize = X_dl_segmented_resize.reshape(X_dl_segmented_resize.shape[0], -1)

# %% Encode the labels and split
# Factorize ONCE so that every split shares the same integer <-> name mapping.
r_rscv = y.copy()
y_rscv = r_rscv.factorize()      # (integer codes, unique names in order of first appearance)
label_codes = y_rscv[0]
label_names = list(y_rscv[1])    # label_names[code] is the class name for `code`

# Fixed positional hold-out: the first HOLDOUT_COUNT images are the test set.
# NOTE(review): img_list is sorted by path, so this slice is not shuffled.
X_dl_train_segmented = X_dl_segmented_resize[HOLDOUT_COUNT:]
X_dl_test_segmented = X_dl_segmented_resize[:HOLDOUT_COUNT]

X_ml_train_segmented = X_ml_segmented_resize[HOLDOUT_COUNT:]
X_ml_test_segmented = X_ml_segmented_resize[:HOLDOUT_COUNT]

# Same (codes, names) tuple shape that Series.factorize() returns.
y_train = (label_codes[HOLDOUT_COUNT:], y_rscv[1])
y_test = (label_codes[:HOLDOUT_COUNT], y_rscv[1])

test_count = math.floor(len(X_dl_segmented_resize) * TEST_FRACTION)

xtr_dl, xts_dl, ytr_dl, yts_dl = train_test_split(X_dl_segmented_resize, label_codes,
                                                  test_size=test_count, random_state=SEED)
xtr_ml, xts_ml, ytr_ml, yts_ml = train_test_split(X_ml_segmented_resize, label_codes,
                                                  test_size=test_count, random_state=SEED)

# %% Convolutional feature extractor
# The stack is never trained: it is used with its initial weights purely to
# turn each image into a flat feature vector (hence the explicit seed above).
cnn_svm = models.Sequential([
    layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=(IMG_SIZE, IMG_SIZE, CHANNEL)),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(filters=256, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    # Flatten the last feature maps into one vector per image.
    layers.Flatten(),
])

X_full_cnn_svm = cnn_svm.predict(X_dl_segmented_resize)

# %%
X_full_cnn_svm.shape

# %% Split the extracted features
xtr_hybird, xts_hybird, ytr_hybird, yts_hybird = train_test_split(X_full_cnn_svm, label_codes,
                                                                  test_size=test_count, random_state=SEED)

# %% Random forest on the CNN features
loaded_model = RandomForestClassifier(n_estimators = 200, min_samples_split = 2, min_samples_leaf = 2,
                                      max_features = 'sqrt', max_depth = 80, bootstrap = False,
                                      random_state = SEED)

loaded_model.fit(xtr_hybird, ytr_hybird)

y_pred_cnn_rf_bestParam = loaded_model.predict(xts_hybird)
cnn_rf_bestParam_segmented_score = accuracy_score(yts_hybird, y_pred_cnn_rf_bestParam)
print('Random Forest Accuracy: ', cnn_rf_bestParam_segmented_score)


# %% Lung segmentation
def _crop_fractions(num, show_on_window, crop_percentage):
    """Return the (start, end) fractions of each image side that are kept.

    In window mode the user-chosen `crop_percentage` is trimmed from every
    edge. Otherwise the margin depends on the image index `num`; the index
    ranges are the hand-tuned values of the original preprocessing run and are
    checked in the same order as before (earlier rules win on overlap).
    """
    if show_on_window:
        return crop_percentage, 1 - crop_percentage
    if num == 161 or 173 <= num <= 174 or num == 758:
        return 0.20, 0.80
    if 756 <= num <= 767:
        return 0, 1
    if (num == 1320 or num == 1219 or 712 <= num <= 767 or 779 <= num <= 799
            or 688 <= num <= 699 or 648 <= num <= 664 or 225 <= num <= 234):
        return 0.03, 0.97
    return 0.12, 0.88


def _write_image(filename, image):
    """Write `image` to `filename`, creating the target folder if needed.

    Boolean masks are converted to 0/255 uint8 first because cv.imwrite cannot
    encode them. Raises IOError when OpenCV reports that nothing was written.
    """
    if image.dtype == bool:
        image = image.astype(np.uint8) * 255
    folder = os.path.dirname(filename)
    if folder:
        os.makedirs(folder, exist_ok=True)
    if not cv.imwrite(filename, image):
        raise IOError('Failed to write image: ' + filename)


def get_segmented_lungs(im, num, save=False, plot=False, show_on_window=False, crop_percentage=0.05):
    """Segment the lungs in a 2-D grayscale slice and return the masked image.

    Parameters
    ----------
    im : 2-D array read with cv.imread(..., cv.IMREAD_GRAYSCALE).
    num : index of the image; selects the crop margin when `show_on_window`
        is False and is used in the file names written when `save` is True.
    save : when True (and `show_on_window` is False) write five intermediate
        images below 'preprocessing/pre1/'.
    plot : when True call `plot_img` with every intermediate image.
    show_on_window : when True crop by `crop_percentage` and write all ten
        stages to 'result/' for the GUI.
    crop_percentage : fraction (0 <= value < 0.5) trimmed from each edge in
        window mode.

    Returns
    -------
    The cropped input with everything outside the lung mask set to 0, resized
    to 528 x 528.

    Raises
    ------
    NameError : `plot` is True but `plot_img` / `plt` are not defined.
    ValueError : `crop_percentage` would leave an empty image.
    IOError : an output image could not be written.
    """
    if plot and ('plot_img' not in globals() or 'plt' not in globals()):
        # Neither name is defined or imported anywhere in this notebook; fail
        # before doing any work instead of after all files were written.
        raise NameError("plot=True requires a plot_img() helper and matplotlib.pyplot imported as plt")
    if show_on_window and not 0 <= crop_percentage < 0.5:
        raise ValueError('crop_percentage must be in [0, 0.5), got ' + str(crop_percentage))

    #Step 1: Crop the image.
    lo, hi = _crop_fractions(num, show_on_window, crop_percentage)
    height, width = im.shape[:2]
    crop = im[int(height*lo):int(height*hi), int(width*lo):int(width*hi)].copy()

    #Step 2: Convert into a binary image (pixels <= 140 become 255).
    ret, binary = cv.threshold(crop, 140, 255, cv.THRESH_BINARY_INV)

    #Step 3: Remove the blobs connected to the border of the image.
    cleared = clear_border(binary)

    #Step 4: Closure operation with a disk of radius 2.
    closing = binary_closing(cleared, disk(2))

    #Step 5: Label the connected regions.
    label_image = label(closing)

    #Step 6: Keep only the two largest regions (ties with the second largest are kept).
    regions = regionprops(label_image)
    if len(regions) > 2:
        second_largest = sorted(r.area for r in regions)[-2]
        for region in regions:
            if region.area < second_largest:
                coords = region.coords
                label_image[coords[:, 0], coords[:, 1]] = 0
    segmented_area = label_image > 0

    #Step 7: Erosion with a disk of radius 2, to separate lung nodules
    #attached to blood vessels.
    erosion = binary_erosion(segmented_area, disk(2))

    #Step 8: Closure with a disk of radius 10, to keep nodules attached to
    #the lung wall.
    closing2 = binary_closing(erosion, disk(10))

    #Step 9: Fill in the small holes inside the binary mask of the lungs.
    edges = roberts(closing2)
    fill_holes = ndi.binary_fill_holes(edges)

    #Step 10: Overlay the binary mask on the cropped input image.
    superimpose = crop.copy()
    superimpose[fill_holes == 0] = 0
    superimpose = cv.resize(superimpose, (528, 528))

    if show_on_window:
        stages = [im, crop, binary, cleared, closing, segmented_area, erosion, closing2, fill_holes, superimpose]
        for title, stage in zip(RESULT_TITLES, stages):
            _write_image(RESULT_DIR + title + RESULT_EXT, stage)
    elif save:
        directory1 = 'preprocessing/pre1/'
        directory2 = '.jpg'
        images = [crop, binary, cleared, label_image, superimpose]
        titles = ['cropped_image', 'binary_image', 'remove_blobs', 'label', 'result']
        for idx, (title, image) in enumerate(zip(titles, images)):
            filename = directory1 + str(idx+1) + title + '/' + title + str(num+1) + directory2
            _write_image(filename, image)

    if plot:
        images = [im, crop, binary, cleared, closing, label_image, segmented_area, erosion, closing2, fill_holes, superimpose]
        titles = ['Original Image',
                  'Step 1: Cropped Image',
                  'Step 2: Binary image',
                  'Step 3: Remove blobs',
                  'Step 4: Closure',
                  'Step 5: Label',
                  'Step 6: Region On Interest',
                  'Step 7: Erosion',
                  'Step 8: Closure',
                  'Step 9: Fill Holes',
                  'Step 10: Result']
        plot_img(images, titles, camp=plt.cm.bone, rows = 3, cols = 4, fontsize= 50)

    return superimpose


# %% Label decoding
def convertLabels(y_test, classes):
    """Return the human-readable class name stored at index `y_test` of `classes`."""
    return classes[y_test]


# %% Graphical interface
root = Tk()
root.title('Lung Cancer Prediction System')
width= root.winfo_screenwidth()
height= root.winfo_screenheight()
root.geometry("%dx%d" %(width, height))

# Path of the CT scan currently shown; '' until the user picks one.
image_path = ''
# Image widgets created by plot_on_app(), reused on every refresh.
result_image_labels = []


def openfn():
    """Ask the user for an image file and remember it in `image_path`.

    Returns the selected path, or an empty value when the dialog was
    cancelled; in that case the previous selection is left untouched.
    """
    global image_path
    filename = filedialog.askopenfilename(initialdir="test", title ="Select a CT-Scan Image", filetypes=(("jpg files","*.jpg*"),("png files","*.png"),("jpeg files","*.jpeg")))
    if filename:
        image_path = filename
    return filename


def _read_ct_scan(path):
    """Return `path` as a 528 x 528 grayscale array, or None if it is not a readable PNG/JPEG."""
    try:
        # imghdr reports JPEG files as 'jpeg' (never 'jpg').
        # NOTE(review): imghdr is deprecated in recent Python versions.
        kind = imghdr.what(path)
    except OSError:
        return None
    if kind not in ('png', 'jpeg'):
        return None
    img = cv.imread(path, cv.IMREAD_GRAYSCALE)
    if img is None:
        return None
    return cv.resize(img, (SEGMENT_SIZE, SEGMENT_SIZE))


def _segment_and_predict(path):
    """Segment the scan at `path` with the selected crop, refresh the images and the prediction."""
    img = _read_ct_scan(path)
    if img is None:
        result_label.config(text='Prediction Result : unsupported or unreadable image')
        return
    numOfImg = 1  # ignored by get_segmented_lungs in window mode
    get_segmented_lungs(img, numOfImg, show_on_window=True, crop_percentage=float(int(clicked.get())/100))
    plot_on_app()
    get_prediction_result()


def open_img():
    """Button callback: choose a scan, then segment and classify it."""
    if openfn():
        _segment_and_predict(image_path)


def plot_on_app():
    """Show the ten stage images from 'result/' in a 2 x 5 grid.

    The widgets are created on first use and only their picture is replaced
    afterwards, so repeated runs do not pile up new widgets.
    """
    for i, title in enumerate(RESULT_TITLES):
        with Image.open(RESULT_DIR + title + RESULT_EXT) as stage:
            thumbnail = stage.resize((242, 242))
        photo = ImageTk.PhotoImage(thumbnail)

        if i < len(result_image_labels):
            label_image = result_image_labels[i]
            label_image.config(image=photo)
        else:
            label_image = Label(root, image=photo)
            label_image.place(x=82 + 280 * (i % 5), y=62 + 320 * (i // 5))
            result_image_labels.append(label_image)
        # Keep a reference, otherwise Tk drops the picture when `photo` is collected.
        label_image.image = photo


def get_prediction_result():
    """Classify 'result/9_result.jpg' and show the predicted class name."""
    img = cv.imread(RESULT_DIR + RESULT_TITLES[-1] + RESULT_EXT, cv.IMREAD_GRAYSCALE)
    if img is None:
        result_label.config(text='Prediction Result : segmented image not found')
        return
    img = cv.resize(img, (IMG_SIZE, IMG_SIZE))
    predict_img = img.reshape(-1, IMG_SIZE, IMG_SIZE, CHANNEL)
    extraction_predict_img = cnn_svm.predict(predict_img)
    result_predict_img = loaded_model.predict(extraction_predict_img)
    # Decode with the names produced by the same factorization the forest was
    # trained on, not with a separately maintained list.
    result_predict_img_converted = convertLabels(result_predict_img[0], label_names)
    result_label.config(text='Prediction Result : ' + str(result_predict_img_converted))


def recrop():
    """Button callback: re-run the pipeline on the current scan with the crop now selected."""
    if image_path != '':
        _segment_and_predict(image_path)


# Empty frames that will hold the ten stage images (2 rows x 5 columns).
for i in range(10):
    Frame(root, highlightbackground="black", highlightthickness=2,width=250, height=250).place(x=80 + 280 * (i % 5), y=60 + 320 * (i // 5))

txt1 = 'Select a CT-Scan to predict ->'
Label(root, text=txt1, font=('Times', '18', 'italic')).place(x = 40, y = 10)

Button(root, text='Select a CT-Scan Image', font=('Times', '13', 'italic'), command=open_img, bg='#C7C6C1', bd=3).place(x = 350, y = 10)
titles = ['Original Image', 'Step 1: Cropped Image', 'Step 2: Binary image', 'Step 3: Remove blobs', 'Step 4: Closure'
         , 'Step 5: Region On Interest', 'Step 6: Erosion', 'Step 7: Closure', 'Step 8: Fill Holes', 'Step 9: Segmented Result']

txt1 = 'Prediction Result : '
result_label = Label(root, text=txt1, font=('Times', '18', 'italic'))
result_label.place(x = 40, y = 710)

# Static text: these figures are not computed by this notebook.
txt1 = 'Average Test Accuracy : 0.9386, Standard Deviation : 0.0246'
Label(root, text=txt1, font=('Times', '18', 'italic')).place(x = 770, y = 690)

txt1 = 'Average Precision Score : 0.95, Average Recall Score : 0.94, Average F1 Score : 0.94'
Label(root, text=txt1, font=('Times', '18', 'italic')).place(x = 650, y = 730)

txt1 = 'Crop : %'
Label(root, text=txt1, font=('Times', '18', 'italic')).place(x = 700, y = 10)

# Crop margin selectable by the user, in percent of each image side.
options = [str(percent) for percent in range(31)]

# Currently selected crop margin (kept as text by the menu), default 5 %.
clicked = StringVar()
clicked.set( "5" )

# .place() returns None, so keep the widget itself in `drop`.
drop = OptionMenu( root , clicked , *options)
drop.place(x = 772, y = 10)

Button(root, text='Reload', command=recrop, font=('Times', '13', 'italic'), bg='#C7C6C1', bd=3).place(x = 890, y = 10)

# Captions below the frames; x positions were tuned per caption width.
coor_x = [130, 370, 660, 940, 1250, 70, 405, 690, 960, 1200]
coor_y = 310
for i in range(10):
    if i == 5:
        coor_y = coor_y + 320
    Label(root, text=titles[i], font=('Times', '18', 'italic')).place(x = coor_x[i], y = coor_y)

root.mainloop()