[624d49]: / Regression RFS / FinalTestRFS.ipynb

Download this file

154 lines (153 with data), 5.3 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from sklearn.experimental import enable_iterative_imputer\n",
    "from sklearn.impute import IterativeImputer\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_df = pd.read_excel(\"FinalTestDataset2024.xls\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read the RandomForestClassification model using pickle\n",
    "with open(\"svr_test.pickle\", \"rb\") as f:\n",
    "    SVR = pickle.load(f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Find missing values in rows\n",
    "missing_values_index = np.where(test_df == 999)[0]\n",
    "\n",
    "# Find index where missing values are more than 4\n",
    "drop_index = [\n",
    "    index for index in set(missing_values_index)\n",
    "    if (test_df.iloc[index] == 999).sum() >= 4\n",
    "]\n",
    "\n",
    "# Drop the rows where missing values are more than 4\n",
    "test_df = test_df.drop(drop_index).reset_index(drop=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "ID_data = test_df['ID']\n",
    "\n",
    "# Drop the 'ID' from test_df\n",
    "test_df.drop('ID', axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Replace 999 with Nan\n",
    "missing_values_index = np.where(test_df == 999)\n",
    "new_df = test_df.replace(999, np.NaN)\n",
    "\n",
    "# И IterativeImputer\n",
    "multivariate_imp = IterativeImputer(random_state=42)\n",
    "multi_imputed_array = multivariate_imp.fit_transform(new_df)\n",
    "\n",
    "# Round imputed values\n",
    "for row, col in zip(*missing_values_index):\n",
    "    multi_imputed_array[row, col] = np.round(multi_imputed_array[row, col])\n",
    "\n",
    "# Create a DataFrame from the imputed array, with the columns and index of original dataframe\n",
    "multi_imputed_df = pd.DataFrame(multi_imputed_array, columns=test_df.columns)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature which we found using feature selection in the training dataset\n",
    "feature_selection_list = ['original_shape_Maximum2DDiameterColumn', 'original_firstorder_90Percentile', 'original_glcm_JointEntropy', 'original_glcm_Imc1', 'original_gldm_SmallDependenceLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'original_glrlm_RunPercentage', 'original_firstorder_Variance', 'ChemoGrade', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterSlice', 'TumourStage', 'original_shape_Sphericity', 'original_glszm_SizeZoneNonUniformity', 'original_firstorder_Range', 'original_glcm_SumEntropy', 'original_firstorder_RootMeanSquared', 'original_shape_Maximum2DDiameterRow', 'original_glcm_JointEnergy', 'Gene', 'original_gldm_DependenceNonUniformityNormalized', 'original_glszm_SmallAreaHighGrayLevelEmphasis', 'original_shape_Maximum3DDiameter', 'original_firstorder_MeanAbsoluteDeviation', 'original_shape_MinorAxisLength', 'original_glszm_ZoneEntropy', 'original_glcm_MaximumProbability', 'original_firstorder_10Percentile', 'original_gldm_LargeDependenceHighGrayLevelEmphasis', 'original_firstorder_Maximum', 'original_glszm_SizeZoneNonUniformityNormalized', 'ER', 'original_firstorder_Kurtosis', 'HER2', 'original_firstorder_RobustMeanAbsoluteDeviation', 'original_shape_MajorAxisLength', 'original_shape_Elongation', 'original_glszm_LowGrayLevelZoneEmphasis', 'Age', 'original_glcm_SumSquares', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_gldm_SmallDependenceHighGrayLevelEmphasis', 'original_firstorder_InterquartileRange']\n",
    "\n",
    "feature_selected = multi_imputed_df[feature_selection_list]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "scaler = StandardScaler()\n",
    "Xs_train = scaler.fit_transform(feature_selected)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "predictions = SVR.predict(Xs_train)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "target_df = pd.DataFrame({'ID': ID_data, 'RelapseFreeSurvival (outcome)': predictions})\n",
    "target_df.to_csv('RFSPrediction.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}