Machine-Learning-for-Dise / Git / [4bdf3e] /FinalTestPCR.ipynb

Models:
joseph-gordon/
Machine-Learning-for-Dise
Downloads: 1
[4bdf3e]: / FinalTestPCR.ipynb
History
Download this file
148 lines (147 with data), 3.7 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 65,
   "id": "09f2ef64",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the trained model; the prediction cell further down calls model.predict() on it.\n",
    "\n",
    "import joblib\n",
    "\n",
    "MODEL_PATH = 'XGBoost_final.pkl'  # change this to the required file name/path.\n",
    "\n",
    "# SECURITY: joblib.load unpickles the file, and unpickling can execute arbitrary code.\n",
    "# Only load model files that come from a trusted source.\n",
    "model = joblib.load(MODEL_PATH)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "d4325fde",
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# Read the test spreadsheet into a DataFrame.\n",
    "# Edit the path passed to read_excel so it points at the file that should be scored.\n",
    "\n",
    "import pandas as pd\n",
    "test_Df = pd.read_excel(io='TestDatasetExample (1).xls')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "id": "bca2827b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "\n",
    "# Preprocess the loaded sheet into the feature frame that is passed to model.predict().\n",
    "#\n",
    "# Column layout relied on below: the first N_LEADING_COLS columns are kept as they are\n",
    "# (they include 'ID', 'Age' and 'Proliferation'); every column after them is standardized\n",
    "# and reduced to N_COMPONENTS principal components named COMP0, COMP1, ...\n",
    "N_LEADING_COLS = 11\n",
    "N_COMPONENTS = 6\n",
    "MISSING_SENTINEL = 999  # this value is treated as a missing entry\n",
    "\n",
    "# The slicing below is positional, so it is only valid on the freshly loaded sheet.\n",
    "# Fail with a clear message if this cell is re-run on its own output.\n",
    "if 'COMP0' in test_Df.columns:\n",
    "    raise RuntimeError('test_Df is already preprocessed; re-run the loading cell first.')\n",
    "\n",
    "# Mark missing entries with NaN rather than None: NaN keeps numeric columns numeric,\n",
    "# while None can turn them into object dtype. 'ID' is restored afterwards so that an\n",
    "# identifier that happens to equal the sentinel is never overwritten.\n",
    "ids = test_Df['ID']\n",
    "test_Df = test_Df.replace(MISSING_SENTINEL, np.nan)\n",
    "test_Df['ID'] = ids\n",
    "\n",
    "# Impute every non-ID column with its most frequent value. The result is assigned back\n",
    "# (instead of fillna(inplace=True) on a column selection) so it also works under\n",
    "# pandas copy-on-write.\n",
    "for col in test_Df.columns.drop('ID'):\n",
    "    modes = test_Df[col].mode()\n",
    "    if modes.empty:\n",
    "        raise ValueError(f'column {col!r} has no observed values to impute from')\n",
    "    test_Df[col] = test_Df[col].fillna(modes.iloc[0])\n",
    "\n",
    "# NOTE(review): the scaler and PCA are fitted on the test sheet itself. If the model was\n",
    "# trained on components from a scaler/PCA fitted on the training data, the components\n",
    "# produced here are not in the same basis - TODO persist the fitted transformers with the\n",
    "# model and only call .transform() here.\n",
    "test_Df_forPCA = test_Df.iloc[:, N_LEADING_COLS:]\n",
    "scaler = StandardScaler()\n",
    "test_Df_forPCA = scaler.fit_transform(test_Df_forPCA)\n",
    "\n",
    "pca = PCA(n_components=N_COMPONENTS)\n",
    "test_Df_afterPCA = pd.DataFrame(\n",
    "    pca.fit_transform(test_Df_forPCA),\n",
    "    columns=[f'COMP{i}' for i in range(N_COMPONENTS)],\n",
    "    index=test_Df.index,\n",
    ")\n",
    "\n",
    "# Keep the leading columns and append the components; both share the same index.\n",
    "test_Df = pd.concat([test_Df.iloc[:, :N_LEADING_COLS], test_Df_afterPCA], axis=1)\n",
    "\n",
    "# Standardize 'Age' only; the NOTE(review) about fitting on test data applies here too.\n",
    "test_Df['Age'] = StandardScaler().fit_transform(test_Df[['Age']]).ravel()\n",
    "\n",
    "# 'Proliferation' is not passed to the model.\n",
    "test_Df = test_Df.drop(columns=['Proliferation'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "id": "42d4a3c7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run the model on every row; the 'ID' column is excluded from the model input.\n",
    "test_predictions = model.predict(\n",
    "    test_Df.drop(labels=['ID'], axis='columns')\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "id": "da1931da",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Wrap the predictions in a one-column frame so they can be written out next to the IDs.\n",
    "test_predictions = pd.DataFrame(\n",
    "    data=test_predictions,\n",
    "    columns=['pCR(result)'],\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "id": "3fba86a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair each ID with its prediction and write the result sheet.\n",
    "# pd.concat(axis=1) aligns on the index and silently pads with NaN when the two sides\n",
    "# differ, so check the row counts and join by position.\n",
    "assert len(test_predictions) == len(test_Df), 'number of predictions does not match number of rows'\n",
    "\n",
    "to_excel_Df = pd.concat(\n",
    "    [test_Df[['ID']].reset_index(drop=True), test_predictions.reset_index(drop=True)],\n",
    "    axis=1,\n",
    ")\n",
    "to_excel_Df.to_excel('FinalTestPCR.xlsx', index=False, header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45827c84",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "77571afa",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}