--- a +++ b/Regression RFS/FinalTestRFS.ipynb @@ -0,0 +1,153 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.experimental import enable_iterative_imputer\n", + "from sklearn.impute import IterativeImputer\n", + "from sklearn.preprocessing import StandardScaler\n", + "import pickle" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "test_df = pd.read_excel(\"FinalTestDataset2024.xls\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Read the RandomForestClassification model using pickle\n", + "with open(\"svr_test.pickle\", \"rb\") as f:\n", + " SVR = pickle.load(f)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# Find missing values in rows\n", + "missing_values_index = np.where(test_df == 999)[0]\n", + "\n", + "# Find index where missing values are more than 4\n", + "drop_index = [\n", + " index for index in set(missing_values_index)\n", + " if (test_df.iloc[index] == 999).sum() >= 4\n", + "]\n", + "\n", + "# Drop the rows where missing values are more than 4\n", + "test_df = test_df.drop(drop_index).reset_index(drop=True)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "ID_data = test_df['ID']\n", + "\n", + "# Drop the 'ID' from test_df\n", + "test_df.drop('ID', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Replace 999 with Nan\n", + "missing_values_index = np.where(test_df == 999)\n", + "new_df = test_df.replace(999, np.NaN)\n", + "\n", + "# И IterativeImputer\n", + "multivariate_imp = IterativeImputer(random_state=42)\n", + "multi_imputed_array = multivariate_imp.fit_transform(new_df)\n", + "\n", + "# Round imputed values\n", + "for row, col in zip(*missing_values_index):\n", + " multi_imputed_array[row, col] = np.round(multi_imputed_array[row, col])\n", + "\n", + "# Create a DataFrame from the imputed array, with the columns and index of original dataframe\n", + "multi_imputed_df = pd.DataFrame(multi_imputed_array, columns=test_df.columns)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "# Feature which we found using feature selection in the training dataset\n", + "feature_selection_list = ['original_shape_Maximum2DDiameterColumn', 'original_firstorder_90Percentile', 'original_glcm_JointEntropy', 'original_glcm_Imc1', 'original_gldm_SmallDependenceLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'original_glrlm_RunPercentage', 'original_firstorder_Variance', 'ChemoGrade', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterSlice', 'TumourStage', 'original_shape_Sphericity', 'original_glszm_SizeZoneNonUniformity', 'original_firstorder_Range', 'original_glcm_SumEntropy', 'original_firstorder_RootMeanSquared', 'original_shape_Maximum2DDiameterRow', 'original_glcm_JointEnergy', 'Gene', 'original_gldm_DependenceNonUniformityNormalized', 'original_glszm_SmallAreaHighGrayLevelEmphasis', 'original_shape_Maximum3DDiameter', 'original_firstorder_MeanAbsoluteDeviation', 'original_shape_MinorAxisLength', 'original_glszm_ZoneEntropy', 'original_glcm_MaximumProbability', 'original_firstorder_10Percentile', 'original_gldm_LargeDependenceHighGrayLevelEmphasis', 'original_firstorder_Maximum', 'original_glszm_SizeZoneNonUniformityNormalized', 'ER', 'original_firstorder_Kurtosis', 'HER2', 'original_firstorder_RobustMeanAbsoluteDeviation', 'original_shape_MajorAxisLength', 'original_shape_Elongation', 'original_glszm_LowGrayLevelZoneEmphasis', 'Age', 'original_glcm_SumSquares', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_gldm_SmallDependenceHighGrayLevelEmphasis', 'original_firstorder_InterquartileRange']\n", + "\n", + "feature_selected = multi_imputed_df[feature_selection_list]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "scaler = StandardScaler()\n", + "Xs_train = scaler.fit_transform(feature_selected)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "predictions = SVR.predict(Xs_train)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "target_df = pd.DataFrame({'ID': ID_data, 'RelapseFreeSurvival (outcome)': predictions})\n", + "target_df.to_csv('RFSPrediction.csv', index=False)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}