Switch to unified view

a b/Regression RFS/FinalTestRFS.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 1,
6
   "metadata": {},
7
   "outputs": [],
8
   "source": [
9
    "import numpy as np\n",
10
    "import pandas as pd\n",
11
    "from sklearn.experimental import enable_iterative_imputer\n",
12
    "from sklearn.impute import IterativeImputer\n",
13
    "from sklearn.preprocessing import StandardScaler\n",
14
    "import pickle"
15
   ]
16
  },
17
  {
18
   "cell_type": "code",
19
   "execution_count": 2,
20
   "metadata": {},
21
   "outputs": [],
22
   "source": [
23
    "test_df = pd.read_excel(\"FinalTestDataset2024.xls\")"
24
   ]
25
  },
26
  {
27
   "cell_type": "code",
28
   "execution_count": 3,
29
   "metadata": {},
30
   "outputs": [],
31
   "source": [
32
    "# Load the pickled SVR model\n",
33
    "with open(\"svr_test.pickle\", \"rb\") as f:\n",
34
    "    SVR = pickle.load(f)"
35
   ]
36
  },
37
  {
38
   "cell_type": "code",
39
   "execution_count": 4,
40
   "metadata": {},
41
   "outputs": [],
42
   "source": [
43
    "# Find missing values in rows\n",
44
    "missing_values_index = np.where(test_df == 999)[0]\n",
45
    "\n",
46
    "# Find the rows that have 4 or more missing values\n",
47
    "drop_index = [\n",
48
    "    index for index in set(missing_values_index)\n",
49
    "    if (test_df.iloc[index] == 999).sum() >= 4\n",
50
    "]\n",
51
    "\n",
52
    "# Drop the rows that have 4 or more missing values\n",
53
    "test_df = test_df.drop(drop_index).reset_index(drop=True)\n"
54
   ]
55
  },
56
  {
57
   "cell_type": "code",
58
   "execution_count": 5,
59
   "metadata": {},
60
   "outputs": [],
61
   "source": [
62
    "ID_data = test_df['ID']\n",
63
    "\n",
64
    "# Drop the 'ID' from test_df\n",
65
    "test_df.drop('ID', axis=1, inplace=True)"
66
   ]
67
  },
68
  {
69
   "cell_type": "code",
70
   "execution_count": 6,
71
   "metadata": {},
72
   "outputs": [],
73
   "source": [
74
    "# Record where the 999 missing-value sentinel occurs, then replace it with NaN\n",
75
    "missing_values_index = np.where(test_df == 999)\n",
76
    "new_df = test_df.replace(999, np.nan)\n",
77
    "\n",
78
    "# Impute the missing values with IterativeImputer\n",
79
    "multivariate_imp = IterativeImputer(random_state=42)\n",
80
    "multi_imputed_array = multivariate_imp.fit_transform(new_df)\n",
81
    "\n",
82
    "# Round only the imputed cells (the positions that originally held 999)\n",
83
    "for row, col in zip(*missing_values_index):\n",
84
    "    multi_imputed_array[row, col] = np.round(multi_imputed_array[row, col])\n",
85
    "\n",
86
    "# Create a DataFrame from the imputed array, with the columns and index of the original dataframe\n",
87
    "multi_imputed_df = pd.DataFrame(multi_imputed_array, columns=test_df.columns, index=test_df.index)\n"
88
   ]
89
  },
90
  {
91
   "cell_type": "code",
92
   "execution_count": 7,
93
   "metadata": {},
94
   "outputs": [],
95
   "source": [
96
    "# Features chosen by feature selection on the training dataset\n",
97
    "feature_selection_list = ['original_shape_Maximum2DDiameterColumn', 'original_firstorder_90Percentile', 'original_glcm_JointEntropy', 'original_glcm_Imc1', 'original_gldm_SmallDependenceLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'original_glrlm_RunPercentage', 'original_firstorder_Variance', 'ChemoGrade', 'original_shape_LeastAxisLength', 'original_shape_Maximum2DDiameterSlice', 'TumourStage', 'original_shape_Sphericity', 'original_glszm_SizeZoneNonUniformity', 'original_firstorder_Range', 'original_glcm_SumEntropy', 'original_firstorder_RootMeanSquared', 'original_shape_Maximum2DDiameterRow', 'original_glcm_JointEnergy', 'Gene', 'original_gldm_DependenceNonUniformityNormalized', 'original_glszm_SmallAreaHighGrayLevelEmphasis', 'original_shape_Maximum3DDiameter', 'original_firstorder_MeanAbsoluteDeviation', 'original_shape_MinorAxisLength', 'original_glszm_ZoneEntropy', 'original_glcm_MaximumProbability', 'original_firstorder_10Percentile', 'original_gldm_LargeDependenceHighGrayLevelEmphasis', 'original_firstorder_Maximum', 'original_glszm_SizeZoneNonUniformityNormalized', 'ER', 'original_firstorder_Kurtosis', 'HER2', 'original_firstorder_RobustMeanAbsoluteDeviation', 'original_shape_MajorAxisLength', 'original_shape_Elongation', 'original_glszm_LowGrayLevelZoneEmphasis', 'Age', 'original_glcm_SumSquares', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_gldm_SmallDependenceHighGrayLevelEmphasis', 'original_firstorder_InterquartileRange']\n",
98
    "\n",
99
    "feature_selected = multi_imputed_df[feature_selection_list]\n"
100
   ]
101
  },
102
  {
103
   "cell_type": "code",
104
   "execution_count": 8,
105
   "metadata": {},
106
   "outputs": [],
107
   "source": [
108
    "scaler = StandardScaler()\n",
109
    "Xs_train = scaler.fit_transform(feature_selected)"
110
   ]
111
  },
112
  {
113
   "cell_type": "code",
114
   "execution_count": 9,
115
   "metadata": {},
116
   "outputs": [],
117
   "source": [
118
    "predictions = SVR.predict(Xs_train)\n"
119
   ]
120
  },
121
  {
122
   "cell_type": "code",
123
   "execution_count": 12,
124
   "metadata": {},
125
   "outputs": [],
126
   "source": [
127
    "target_df = pd.DataFrame({'ID': ID_data, 'RelapseFreeSurvival (outcome)': predictions})\n",
128
    "target_df.to_csv('RFSPrediction.csv', index=False)"
129
   ]
130
  }
131
 ],
132
 "metadata": {
133
  "kernelspec": {
134
   "display_name": "base",
135
   "language": "python",
136
   "name": "python3"
137
  },
138
  "language_info": {
139
   "codemirror_mode": {
140
    "name": "ipython",
141
    "version": 3
142
   },
143
   "file_extension": ".py",
144
   "mimetype": "text/x-python",
145
   "name": "python",
146
   "nbconvert_exporter": "python",
147
   "pygments_lexer": "ipython3",
148
   "version": "3.12.2"
149
  }
150
 },
151
 "nbformat": 4,
152
 "nbformat_minor": 2
153
}