{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.experimental import enable_iterative_imputer\n",
"from sklearn.impute import IterativeImputer\n",
"from sklearn.preprocessing import StandardScaler\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"test_df = pd.read_excel(\"FinalTestDataset2024.xls\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read the RandomForestClassification model using pickle\n",
"with open(\"svr_test.pickle\", \"rb\") as f:\n",
" SVR = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Find missing values in rows\n",
"missing_values_index = np.where(test_df == 999)[0]\n",
"\n",
"# Find index where missing values are more than 4\n",
"drop_index = [\n",
" index for index in set(missing_values_index)\n",
" if (test_df.iloc[index] == 999).sum() >= 4\n",
"]\n",
"\n",
"# Drop the rows where missing values are more than 4\n",
"test_df = test_df.drop(drop_index).reset_index(drop=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"ID_data = test_df['ID']\n",
"\n",
"# Drop the 'ID' from test_df\n",
"test_df.drop('ID', axis=1, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"# Replace 999 with Nan\n",
"missing_values_index = np.where(test_df == 999)\n",
"new_df = test_df.replace(999, np.NaN)\n",
"\n",
"# И IterativeImputer\n",
"multivariate_imp = IterativeImputer(random_state=42)\n",
"multi_imputed_array = multivariate_imp.fit_transform(new_df)\n",
"\n",
"# Round imputed values\n",
"for row, col in zip(*missing_values_index):\n",
" multi_imputed_array[row, col] = np.round(multi_imputed_array[row, col])\n",
"\n",
"# Create a DataFrame from the imputed array, with the columns and index of original dataframe\n",
"multi_imputed_df = pd.DataFrame(multi_imputed_array, columns=test_df.columns)\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# Feature which we found using feature selection in the training dataset\n",
"feature_selection_list = ['original_shape_Maximum2DDiameterColumn', 'original_firstorder_90Percentile', 'original_glcm_JointEntropy', 'original_glcm_Imc1', 'original_gldm_SmallDependenceLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'original_glrlm_RunPercentage', 'original_firstorder_Variance', 'ChemoGrade', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterSlice', 'TumourStage', 'original_shape_Sphericity', 'original_glszm_SizeZoneNonUniformity', 'original_firstorder_Range', 'original_glcm_SumEntropy', 'original_firstorder_RootMeanSquared', 'original_shape_Maximum2DDiameterRow', 'original_glcm_JointEnergy', 'Gene', 'original_gldm_DependenceNonUniformityNormalized', 'original_glszm_SmallAreaHighGrayLevelEmphasis', 'original_shape_Maximum3DDiameter', 'original_firstorder_MeanAbsoluteDeviation', 'original_shape_MinorAxisLength', 'original_glszm_ZoneEntropy', 'original_glcm_MaximumProbability', 'original_firstorder_10Percentile', 'original_gldm_LargeDependenceHighGrayLevelEmphasis', 'original_firstorder_Maximum', 'original_glszm_SizeZoneNonUniformityNormalized', 'ER', 'original_firstorder_Kurtosis', 'HER2', 'original_firstorder_RobustMeanAbsoluteDeviation', 'original_shape_MajorAxisLength', 'original_shape_Elongation', 'original_glszm_LowGrayLevelZoneEmphasis', 'Age', 'original_glcm_SumSquares', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_gldm_SmallDependenceHighGrayLevelEmphasis', 'original_firstorder_InterquartileRange']\n",
"\n",
"feature_selected = multi_imputed_df[feature_selection_list]\n"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"scaler = StandardScaler()\n",
"Xs_train = scaler.fit_transform(feature_selected)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"predictions = SVR.predict(Xs_train)\n"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"target_df = pd.DataFrame({'ID': ID_data, 'RelapseFreeSurvival (outcome)': predictions})\n",
"target_df.to_csv('RFSPrediction.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}