Breast-Cancer-Regression / Git / Diff of /Regression RFS/FinalTestRFS.ipynb

Models:
joseph-gordon/
Breast-Cancer-Regression
Downloads: 1
Diff of /Regression RFS/FinalTestRFS.ipynb [000000] .. [624d49]
Switch to side-by-side view

--- a
+++ b/Regression  RFS/FinalTestRFS.ipynb
@@ -0,0 +1,153 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.experimental import enable_iterative_imputer\n",
+    "from sklearn.impute import IterativeImputer\n",
+    "from sklearn.preprocessing import StandardScaler\n",
+    "import pickle"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_df = pd.read_excel(\"FinalTestDataset2024.xls\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read the RandomForestClassification model using pickle\n",
+    "with open(\"svr_test.pickle\", \"rb\") as f:\n",
+    "    SVR = pickle.load(f)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Find missing values in rows\n",
+    "missing_values_index = np.where(test_df == 999)[0]\n",
+    "\n",
+    "# Find index where missing values are more than 4\n",
+    "drop_index = [\n",
+    "    index for index in set(missing_values_index)\n",
+    "    if (test_df.iloc[index] == 999).sum() >= 4\n",
+    "]\n",
+    "\n",
+    "# Drop the rows where missing values are more than 4\n",
+    "test_df = test_df.drop(drop_index).reset_index(drop=True)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ID_data = test_df['ID']\n",
+    "\n",
+    "# Drop the 'ID' from test_df\n",
+    "test_df.drop('ID', axis=1, inplace=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Replace 999 with Nan\n",
+    "missing_values_index = np.where(test_df == 999)\n",
+    "new_df = test_df.replace(999, np.NaN)\n",
+    "\n",
+    "# И IterativeImputer\n",
+    "multivariate_imp = IterativeImputer(random_state=42)\n",
+    "multi_imputed_array = multivariate_imp.fit_transform(new_df)\n",
+    "\n",
+    "# Round imputed values\n",
+    "for row, col in zip(*missing_values_index):\n",
+    "    multi_imputed_array[row, col] = np.round(multi_imputed_array[row, col])\n",
+    "\n",
+    "# Create a DataFrame from the imputed array, with the columns and index of original dataframe\n",
+    "multi_imputed_df = pd.DataFrame(multi_imputed_array, columns=test_df.columns)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Feature which we found using feature selection in the training dataset\n",
+    "feature_selection_list = ['original_shape_Maximum2DDiameterColumn', 'original_firstorder_90Percentile', 'original_glcm_JointEntropy', 'original_glcm_Imc1', 'original_gldm_SmallDependenceLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'original_glrlm_RunPercentage', 'original_firstorder_Variance', 'ChemoGrade', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterSlice', 'TumourStage', 'original_shape_Sphericity', 'original_glszm_SizeZoneNonUniformity', 'original_firstorder_Range', 'original_glcm_SumEntropy', 'original_firstorder_RootMeanSquared', 'original_shape_Maximum2DDiameterRow', 'original_glcm_JointEnergy', 'Gene', 'original_gldm_DependenceNonUniformityNormalized', 'original_glszm_SmallAreaHighGrayLevelEmphasis', 'original_shape_Maximum3DDiameter', 'original_firstorder_MeanAbsoluteDeviation', 'original_shape_MinorAxisLength', 'original_glszm_ZoneEntropy', 'original_glcm_MaximumProbability', 'original_firstorder_10Percentile', 'original_gldm_LargeDependenceHighGrayLevelEmphasis', 'original_firstorder_Maximum', 'original_glszm_SizeZoneNonUniformityNormalized', 'ER', 'original_firstorder_Kurtosis', 'HER2', 'original_firstorder_RobustMeanAbsoluteDeviation', 'original_shape_MajorAxisLength', 'original_shape_Elongation', 'original_glszm_LowGrayLevelZoneEmphasis', 'Age', 'original_glcm_SumSquares', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_gldm_SmallDependenceHighGrayLevelEmphasis', 'original_firstorder_InterquartileRange']\n",
+    "\n",
+    "feature_selected = multi_imputed_df[feature_selection_list]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "scaler = StandardScaler()\n",
+    "Xs_train = scaler.fit_transform(feature_selected)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "predictions = SVR.predict(Xs_train)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "target_df = pd.DataFrame({'ID': ID_data, 'RelapseFreeSurvival (outcome)': predictions})\n",
+    "target_df.to_csv('RFSPrediction.csv', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}