--- a +++ b/FinalTestPCR.ipynb @@ -0,0 +1,147 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 65, + "id": "09f2ef64", + "metadata": {}, + "outputs": [], + "source": [ + "# importing the model\n", + "\n", + "import joblib\n", + "model = joblib.load('XGBoost_final.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "d4325fde", + "metadata": { + "scrolled": false + }, + "outputs": [], + "source": [ + "# loading the test dataset\n", + "\n", + "import pandas as pd\n", + "test_Df = pd.read_excel('TestDatasetExample (1).xls') # change this to the required file name/path." + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "bca2827b", + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.decomposition import PCA\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "# preprocessing and null value removal.\n", + "\n", + "test_Df=test_Df.replace(999, None)\n", + "\n", + "#imputation using mode\n", + "for col in test_Df.columns:\n", + " test_Df[col].fillna(test_Df[col].mode()[0], inplace=True)\n", + "\n", + "# only column 11 onwards taken for pca\n", + "test_Df_forPCA = test_Df.iloc[:,11:]\n", + "\n", + "# Standardize the features\n", + "scaler = StandardScaler()\n", + "test_Df_forPCA = scaler.fit_transform(test_Df_forPCA)\n", + "\n", + "# Perform PCA\n", + "pca = PCA(n_components=6) # Reduce to 6 principal components\n", + "test_Df_afterPCA = pca.fit_transform(test_Df_forPCA)\n", + "\n", + "test_Df_afterPCA = pd.DataFrame(test_Df_afterPCA)\n", + "test_Df = test_Df.iloc[:,0:11].merge(test_Df_afterPCA, left_index = True, right_index = True, how = 'right')\n", + "\n", + "new_column_names = {0: 'COMP0', 1: 'COMP1', 2: 'COMP2',3: 'COMP3',4:'COMP4',5:'COMP5',}\n", + "test_Df = test_Df.rename(columns=new_column_names)\n", + "\n", + "# scaling all the values\n", + "test_Df[['Age']] = StandardScaler().fit_transform(test_Df[['Age']])\n", + "\n", + "#drop proliferation as its not needed.\n", + "test_Df = test_Df.drop(columns=['Proliferation'])" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "id": "42d4a3c7", + "metadata": {}, + "outputs": [], + "source": [ + "# Making predictions on data\n", + "test_predictions = model.predict(test_Df.drop(columns=['ID']))" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "id": "da1931da", + "metadata": {}, + "outputs": [], + "source": [ + "test_predictions = pd.DataFrame(test_predictions, columns=['pCR(result)'])" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "id": "3fba86a0", + "metadata": {}, + "outputs": [], + "source": [ + "to_excel_Df = pd.DataFrame(test_Df['ID'])\n", + "#data after handling missing values\n", + "to_excel_Df = pd.concat((to_excel_Df, test_predictions), axis=1)\n", + "to_excel_Df.to_excel('FinalTestPCR.xlsx', index=False, header=True)\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45827c84", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77571afa", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}