148 lines (147 with data), 3.7 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": 65,
"id": "09f2ef64",
"metadata": {},
"outputs": [],
"source": [
"# Load the fitted model from disk.\n",
"# NOTE: joblib.load unpickles the file, which can execute arbitrary code,\n",
"# so only load model files that come from a trusted source.\n",
"\n",
"import joblib\n",
"\n",
"MODEL_PATH = 'XGBoost_final.pkl'\n",
"model = joblib.load(MODEL_PATH)"
]
},
{
"cell_type": "code",
"execution_count": 60,
"id": "d4325fde",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# Read the test spreadsheet into a DataFrame.\n",
"\n",
"import pandas as pd\n",
"\n",
"TEST_DATA_PATH = 'TestDatasetExample (1).xls'  # change this to the required file name/path.\n",
"test_Df = pd.read_excel(TEST_DATA_PATH)"
]
},
{
"cell_type": "code",
"execution_count": 61,
"id": "bca2827b",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Preprocessing: impute missing values, reduce the trailing feature columns\n",
"# with PCA, scale Age and drop the unused Proliferation column.\n",
"\n",
"MISSING_SENTINEL = 999  # placeholder value treated as missing\n",
"N_LEADING_COLS = 11     # columns before this position bypass PCA\n",
"N_COMPONENTS = 6        # number of principal components to keep\n",
"\n",
"# Replace the sentinel with NaN rather than None: None either turns numeric\n",
"# columns into object dtype or, on older pandas versions, is ignored and the\n",
"# value is forward-filled instead.\n",
"test_Df = test_Df.replace(MISSING_SENTINEL, np.nan)\n",
"\n",
"# Imputation using the mode of each column. The result is assigned back to the\n",
"# frame because fillna(inplace=True) on a selected column is a chained\n",
"# assignment that does not update the frame under pandas copy-on-write.\n",
"for col in test_Df.columns:\n",
"    col_mode = test_Df[col].mode()\n",
"    if col_mode.empty:\n",
"        raise ValueError(f'column {col!r} has no observed values to impute from')\n",
"    test_Df[col] = test_Df[col].fillna(col_mode.iloc[0])\n",
"\n",
"# NOTE(review): the scalers and the PCA below are fitted on the test data\n",
"# itself. If the model was trained on features produced by transformers fitted\n",
"# on the training data, those fitted objects should be loaded and applied with\n",
"# transform() here instead - confirm against the training notebook.\n",
"\n",
"# Standardize the columns from position N_LEADING_COLS onwards, then run PCA.\n",
"scaler = StandardScaler()\n",
"test_Df_forPCA = scaler.fit_transform(test_Df.iloc[:, N_LEADING_COLS:])\n",
"\n",
"pca = PCA(n_components=N_COMPONENTS)\n",
"test_Df_afterPCA = pd.DataFrame(\n",
"    pca.fit_transform(test_Df_forPCA),\n",
"    index=test_Df.index,\n",
"    columns=[f'COMP{i}' for i in range(N_COMPONENTS)],\n",
")\n",
"\n",
"# Put the leading columns and the principal components side by side.\n",
"test_Df = pd.concat([test_Df.iloc[:, :N_LEADING_COLS], test_Df_afterPCA], axis=1)\n",
"\n",
"# Standardize Age.\n",
"test_Df[['Age']] = StandardScaler().fit_transform(test_Df[['Age']])\n",
"\n",
"# Drop Proliferation as it is not needed.\n",
"test_Df = test_Df.drop(columns=['Proliferation'])"
]
},
{
"cell_type": "code",
"execution_count": 62,
"id": "42d4a3c7",
"metadata": {},
"outputs": [],
"source": [
"# Predict with the loaded model on every column except ID.\n",
"feature_Df = test_Df.drop(columns=['ID'])\n",
"test_predictions = model.predict(feature_Df)"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "da1931da",
"metadata": {},
"outputs": [],
"source": [
"# Wrap the predictions in a single-column DataFrame for export.\n",
"test_predictions = pd.DataFrame(data=test_predictions, columns=['pCR(result)'])"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "3fba86a0",
"metadata": {},
"outputs": [],
"source": [
"# Pair each ID with its prediction and write the result to an Excel file.\n",
"id_Df = test_Df[['ID']]\n",
"to_excel_Df = pd.concat([id_Df, test_predictions], axis=1)\n",
"to_excel_Df.to_excel('FinalTestPCR.xlsx', index=False, header=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "45827c84",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "77571afa",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}