|
a |
|
b/project.ipynb |
|
|
1 |
{ |
|
|
2 |
"cells": [ |
|
|
3 |
{ |
|
|
4 |
"attachments": {}, |
|
|
5 |
"cell_type": "markdown", |
|
|
6 |
"metadata": {}, |
|
|
7 |
"source": [ |
|
|
8 |
"### Precision Medicine using Machine Learning " |
|
|
9 |
] |
|
|
10 |
}, |
|
|
11 |
{ |
|
|
12 |
"attachments": {}, |
|
|
13 |
"cell_type": "markdown", |
|
|
14 |
"metadata": {}, |
|
|
15 |
"source": [ |
|
|
16 |
"Once sequenced, a cancer tumor can have thousands of genetic mutations. The challenge is to distinguish the mutations that contribute to tumor growth (drivers) from the neutral mutations (passengers). This is a time-consuming and challenging task currently done manually by clinicians based on text-based clinical literature. \n", |
|
|
17 |
"\n", |
|
|
18 |
"This project uses an expert-annotated knowledge base released by Memorial Sloan Kettering Cancer Center (MSKCC) containing thousands of mutations manually annotated by oncologists. \n", |
|
|
19 |
"\n", |
|
|
20 |
"Objective: We need your help to develop a Machine Learning algorithm that, using this knowledge base as a baseline, automatically classifies genetic variations." |
|
|
21 |
] |
|
|
22 |
}, |
|
|
23 |
{ |
|
|
24 |
"cell_type": "code", |
|
|
25 |
"execution_count": 1, |
|
|
26 |
"metadata": {}, |
|
|
27 |
"outputs": [], |
|
|
28 |
"source": [ |
|
|
29 |
"import numpy as np \n", |
|
|
30 |
"import pandas as pd\n", |
|
|
31 |
"import seaborn as sns\n", |
|
|
32 |
"import matplotlib.pyplot as plt\n", |
|
|
33 |
"import seaborn as sns\n", |
|
|
34 |
"import re\n", |
|
|
35 |
"#import warnings \n", |
|
|
36 |
"import math\n", |
|
|
37 |
"import nltk\n", |
|
|
38 |
"from nltk.corpus import stopwords" |
|
|
39 |
] |
|
|
40 |
}, |
|
|
41 |
{ |
|
|
42 |
"cell_type": "markdown", |
|
|
43 |
"metadata": {}, |
|
|
44 |
"source": [ |
|
|
45 |
"Step 1: Loading Data " |
|
|
46 |
] |
|
|
47 |
}, |
|
|
48 |
{ |
|
|
49 |
"cell_type": "code", |
|
|
50 |
"execution_count": 2, |
|
|
51 |
"metadata": {}, |
|
|
52 |
"outputs": [ |
|
|
53 |
{ |
|
|
54 |
"data": { |
|
|
55 |
"text/html": [ |
|
|
56 |
"<div>\n", |
|
|
57 |
"<style scoped>\n", |
|
|
58 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
59 |
" vertical-align: middle;\n", |
|
|
60 |
" }\n", |
|
|
61 |
"\n", |
|
|
62 |
" .dataframe tbody tr th {\n", |
|
|
63 |
" vertical-align: top;\n", |
|
|
64 |
" }\n", |
|
|
65 |
"\n", |
|
|
66 |
" .dataframe thead th {\n", |
|
|
67 |
" text-align: right;\n", |
|
|
68 |
" }\n", |
|
|
69 |
"</style>\n", |
|
|
70 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
71 |
" <thead>\n", |
|
|
72 |
" <tr style=\"text-align: right;\">\n", |
|
|
73 |
" <th></th>\n", |
|
|
74 |
" <th>ID</th>\n", |
|
|
75 |
" <th>Gene</th>\n", |
|
|
76 |
" <th>Variation</th>\n", |
|
|
77 |
" <th>Class</th>\n", |
|
|
78 |
" </tr>\n", |
|
|
79 |
" </thead>\n", |
|
|
80 |
" <tbody>\n", |
|
|
81 |
" <tr>\n", |
|
|
82 |
" <th>0</th>\n", |
|
|
83 |
" <td>0</td>\n", |
|
|
84 |
" <td>FAM58A</td>\n", |
|
|
85 |
" <td>Truncating Mutations</td>\n", |
|
|
86 |
" <td>1</td>\n", |
|
|
87 |
" </tr>\n", |
|
|
88 |
" <tr>\n", |
|
|
89 |
" <th>1</th>\n", |
|
|
90 |
" <td>1</td>\n", |
|
|
91 |
" <td>CBL</td>\n", |
|
|
92 |
" <td>W802*</td>\n", |
|
|
93 |
" <td>2</td>\n", |
|
|
94 |
" </tr>\n", |
|
|
95 |
" <tr>\n", |
|
|
96 |
" <th>2</th>\n", |
|
|
97 |
" <td>2</td>\n", |
|
|
98 |
" <td>CBL</td>\n", |
|
|
99 |
" <td>Q249E</td>\n", |
|
|
100 |
" <td>2</td>\n", |
|
|
101 |
" </tr>\n", |
|
|
102 |
" <tr>\n", |
|
|
103 |
" <th>3</th>\n", |
|
|
104 |
" <td>3</td>\n", |
|
|
105 |
" <td>CBL</td>\n", |
|
|
106 |
" <td>N454D</td>\n", |
|
|
107 |
" <td>3</td>\n", |
|
|
108 |
" </tr>\n", |
|
|
109 |
" <tr>\n", |
|
|
110 |
" <th>4</th>\n", |
|
|
111 |
" <td>4</td>\n", |
|
|
112 |
" <td>CBL</td>\n", |
|
|
113 |
" <td>L399V</td>\n", |
|
|
114 |
" <td>4</td>\n", |
|
|
115 |
" </tr>\n", |
|
|
116 |
" </tbody>\n", |
|
|
117 |
"</table>\n", |
|
|
118 |
"</div>" |
|
|
119 |
], |
|
|
120 |
"text/plain": [ |
|
|
121 |
" ID Gene Variation Class\n", |
|
|
122 |
"0 0 FAM58A Truncating Mutations 1\n", |
|
|
123 |
"1 1 CBL W802* 2\n", |
|
|
124 |
"2 2 CBL Q249E 2\n", |
|
|
125 |
"3 3 CBL N454D 3\n", |
|
|
126 |
"4 4 CBL L399V 4" |
|
|
127 |
] |
|
|
128 |
}, |
|
|
129 |
"execution_count": 2, |
|
|
130 |
"metadata": {}, |
|
|
131 |
"output_type": "execute_result" |
|
|
132 |
} |
|
|
133 |
], |
|
|
134 |
"source": [ |
|
|
135 |
"train_model_variants = pd.read_csv(r\"G:\\Shared drives\\Hack(Her)thon 2023 Project\\data\\training_variants\\training_variants\")\n", |
|
|
136 |
"train_model_variants.head()\n" |
|
|
137 |
] |
|
|
138 |
}, |
|
|
139 |
{ |
|
|
140 |
"cell_type": "code", |
|
|
141 |
"execution_count": 3, |
|
|
142 |
"metadata": {}, |
|
|
143 |
"outputs": [ |
|
|
144 |
{ |
|
|
145 |
"data": { |
|
|
146 |
"text/plain": [ |
|
|
147 |
"(3321, 4)" |
|
|
148 |
] |
|
|
149 |
}, |
|
|
150 |
"execution_count": 3, |
|
|
151 |
"metadata": {}, |
|
|
152 |
"output_type": "execute_result" |
|
|
153 |
} |
|
|
154 |
], |
|
|
155 |
"source": [ |
|
|
156 |
"train_model_variants.shape" |
|
|
157 |
] |
|
|
158 |
}, |
|
|
159 |
{ |
|
|
160 |
"cell_type": "code", |
|
|
161 |
"execution_count": 4, |
|
|
162 |
"metadata": {}, |
|
|
163 |
"outputs": [ |
|
|
164 |
{ |
|
|
165 |
"data": { |
|
|
166 |
"text/html": [ |
|
|
167 |
"<div>\n", |
|
|
168 |
"<style scoped>\n", |
|
|
169 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
170 |
" vertical-align: middle;\n", |
|
|
171 |
" }\n", |
|
|
172 |
"\n", |
|
|
173 |
" .dataframe tbody tr th {\n", |
|
|
174 |
" vertical-align: top;\n", |
|
|
175 |
" }\n", |
|
|
176 |
"\n", |
|
|
177 |
" .dataframe thead th {\n", |
|
|
178 |
" text-align: right;\n", |
|
|
179 |
" }\n", |
|
|
180 |
"</style>\n", |
|
|
181 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
182 |
" <thead>\n", |
|
|
183 |
" <tr style=\"text-align: right;\">\n", |
|
|
184 |
" <th></th>\n", |
|
|
185 |
" <th>ID</th>\n", |
|
|
186 |
" <th>Gene</th>\n", |
|
|
187 |
" <th>Variation</th>\n", |
|
|
188 |
" </tr>\n", |
|
|
189 |
" </thead>\n", |
|
|
190 |
" <tbody>\n", |
|
|
191 |
" <tr>\n", |
|
|
192 |
" <th>0</th>\n", |
|
|
193 |
" <td>0</td>\n", |
|
|
194 |
" <td>ACSL4</td>\n", |
|
|
195 |
" <td>R570S</td>\n", |
|
|
196 |
" </tr>\n", |
|
|
197 |
" <tr>\n", |
|
|
198 |
" <th>1</th>\n", |
|
|
199 |
" <td>1</td>\n", |
|
|
200 |
" <td>NAGLU</td>\n", |
|
|
201 |
" <td>P521L</td>\n", |
|
|
202 |
" </tr>\n", |
|
|
203 |
" <tr>\n", |
|
|
204 |
" <th>2</th>\n", |
|
|
205 |
" <td>2</td>\n", |
|
|
206 |
" <td>PAH</td>\n", |
|
|
207 |
" <td>L333F</td>\n", |
|
|
208 |
" </tr>\n", |
|
|
209 |
" <tr>\n", |
|
|
210 |
" <th>3</th>\n", |
|
|
211 |
" <td>3</td>\n", |
|
|
212 |
" <td>ING1</td>\n", |
|
|
213 |
" <td>A148D</td>\n", |
|
|
214 |
" </tr>\n", |
|
|
215 |
" <tr>\n", |
|
|
216 |
" <th>4</th>\n", |
|
|
217 |
" <td>4</td>\n", |
|
|
218 |
" <td>TMEM216</td>\n", |
|
|
219 |
" <td>G77A</td>\n", |
|
|
220 |
" </tr>\n", |
|
|
221 |
" </tbody>\n", |
|
|
222 |
"</table>\n", |
|
|
223 |
"</div>" |
|
|
224 |
], |
|
|
225 |
"text/plain": [ |
|
|
226 |
" ID Gene Variation\n", |
|
|
227 |
"0 0 ACSL4 R570S\n", |
|
|
228 |
"1 1 NAGLU P521L\n", |
|
|
229 |
"2 2 PAH L333F\n", |
|
|
230 |
"3 3 ING1 A148D\n", |
|
|
231 |
"4 4 TMEM216 G77A" |
|
|
232 |
] |
|
|
233 |
}, |
|
|
234 |
"execution_count": 4, |
|
|
235 |
"metadata": {}, |
|
|
236 |
"output_type": "execute_result" |
|
|
237 |
} |
|
|
238 |
], |
|
|
239 |
"source": [ |
|
|
240 |
"test_variants = pd.read_csv(r\"G:\\Shared drives\\Hack(Her)thon 2023 Project\\data\\test_variants\\test_variants\")\n", |
|
|
241 |
"test_variants.head()" |
|
|
242 |
] |
|
|
243 |
}, |
|
|
244 |
{ |
|
|
245 |
"cell_type": "code", |
|
|
246 |
"execution_count": 5, |
|
|
247 |
"metadata": {}, |
|
|
248 |
"outputs": [ |
|
|
249 |
{ |
|
|
250 |
"data": { |
|
|
251 |
"text/plain": [ |
|
|
252 |
"(5668, 3)" |
|
|
253 |
] |
|
|
254 |
}, |
|
|
255 |
"execution_count": 5, |
|
|
256 |
"metadata": {}, |
|
|
257 |
"output_type": "execute_result" |
|
|
258 |
} |
|
|
259 |
], |
|
|
260 |
"source": [ |
|
|
261 |
"test_variants.shape" |
|
|
262 |
] |
|
|
263 |
}, |
|
|
264 |
{ |
|
|
265 |
"cell_type": "code", |
|
|
266 |
"execution_count": 6, |
|
|
267 |
"metadata": {}, |
|
|
268 |
"outputs": [ |
|
|
269 |
{ |
|
|
270 |
"data": { |
|
|
271 |
"text/html": [ |
|
|
272 |
"<div>\n", |
|
|
273 |
"<style scoped>\n", |
|
|
274 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
275 |
" vertical-align: middle;\n", |
|
|
276 |
" }\n", |
|
|
277 |
"\n", |
|
|
278 |
" .dataframe tbody tr th {\n", |
|
|
279 |
" vertical-align: top;\n", |
|
|
280 |
" }\n", |
|
|
281 |
"\n", |
|
|
282 |
" .dataframe thead th {\n", |
|
|
283 |
" text-align: right;\n", |
|
|
284 |
" }\n", |
|
|
285 |
"</style>\n", |
|
|
286 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
287 |
" <thead>\n", |
|
|
288 |
" <tr style=\"text-align: right;\">\n", |
|
|
289 |
" <th></th>\n", |
|
|
290 |
" <th>ID</th>\n", |
|
|
291 |
" <th>Text</th>\n", |
|
|
292 |
" </tr>\n", |
|
|
293 |
" </thead>\n", |
|
|
294 |
" <tbody>\n", |
|
|
295 |
" <tr>\n", |
|
|
296 |
" <th>0</th>\n", |
|
|
297 |
" <td>0</td>\n", |
|
|
298 |
" <td>Cyclin-dependent kinases (CDKs) regulate a var...</td>\n", |
|
|
299 |
" </tr>\n", |
|
|
300 |
" <tr>\n", |
|
|
301 |
" <th>1</th>\n", |
|
|
302 |
" <td>1</td>\n", |
|
|
303 |
" <td>Abstract Background Non-small cell lung canc...</td>\n", |
|
|
304 |
" </tr>\n", |
|
|
305 |
" <tr>\n", |
|
|
306 |
" <th>2</th>\n", |
|
|
307 |
" <td>2</td>\n", |
|
|
308 |
" <td>Abstract Background Non-small cell lung canc...</td>\n", |
|
|
309 |
" </tr>\n", |
|
|
310 |
" <tr>\n", |
|
|
311 |
" <th>3</th>\n", |
|
|
312 |
" <td>3</td>\n", |
|
|
313 |
" <td>Recent evidence has demonstrated that acquired...</td>\n", |
|
|
314 |
" </tr>\n", |
|
|
315 |
" <tr>\n", |
|
|
316 |
" <th>4</th>\n", |
|
|
317 |
" <td>4</td>\n", |
|
|
318 |
" <td>Oncogenic mutations in the monomeric Casitas B...</td>\n", |
|
|
319 |
" </tr>\n", |
|
|
320 |
" </tbody>\n", |
|
|
321 |
"</table>\n", |
|
|
322 |
"</div>" |
|
|
323 |
], |
|
|
324 |
"text/plain": [ |
|
|
325 |
" ID Text\n", |
|
|
326 |
"0 0 Cyclin-dependent kinases (CDKs) regulate a var...\n", |
|
|
327 |
"1 1 Abstract Background Non-small cell lung canc...\n", |
|
|
328 |
"2 2 Abstract Background Non-small cell lung canc...\n", |
|
|
329 |
"3 3 Recent evidence has demonstrated that acquired...\n", |
|
|
330 |
"4 4 Oncogenic mutations in the monomeric Casitas B..." |
|
|
331 |
] |
|
|
332 |
}, |
|
|
333 |
"execution_count": 6, |
|
|
334 |
"metadata": {}, |
|
|
335 |
"output_type": "execute_result" |
|
|
336 |
} |
|
|
337 |
], |
|
|
338 |
"source": [ |
|
|
339 |
"train_model_text = pd.read_csv(r\"G:\\Shared drives\\Hack(Her)thon 2023 Project\\data\\training_text\\training_text\", sep=\"\\|\\|\", names=[\"ID\", \"Text\"], skiprows=1, engine=\"python\", encoding=\"latin-1\")\n", |
|
|
340 |
"train_model_text.head()\n" |
|
|
341 |
] |
|
|
342 |
}, |
|
|
343 |
{ |
|
|
344 |
"cell_type": "code", |
|
|
345 |
"execution_count": 7, |
|
|
346 |
"metadata": {}, |
|
|
347 |
"outputs": [ |
|
|
348 |
{ |
|
|
349 |
"data": { |
|
|
350 |
"text/html": [ |
|
|
351 |
"<div>\n", |
|
|
352 |
"<style scoped>\n", |
|
|
353 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
354 |
" vertical-align: middle;\n", |
|
|
355 |
" }\n", |
|
|
356 |
"\n", |
|
|
357 |
" .dataframe tbody tr th {\n", |
|
|
358 |
" vertical-align: top;\n", |
|
|
359 |
" }\n", |
|
|
360 |
"\n", |
|
|
361 |
" .dataframe thead th {\n", |
|
|
362 |
" text-align: right;\n", |
|
|
363 |
" }\n", |
|
|
364 |
"</style>\n", |
|
|
365 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
366 |
" <thead>\n", |
|
|
367 |
" <tr style=\"text-align: right;\">\n", |
|
|
368 |
" <th></th>\n", |
|
|
369 |
" <th>ID</th>\n", |
|
|
370 |
" <th>Text</th>\n", |
|
|
371 |
" </tr>\n", |
|
|
372 |
" </thead>\n", |
|
|
373 |
" <tbody>\n", |
|
|
374 |
" <tr>\n", |
|
|
375 |
" <th>0</th>\n", |
|
|
376 |
" <td>0</td>\n", |
|
|
377 |
" <td>2. This mutation resulted in a myeloproliferat...</td>\n", |
|
|
378 |
" </tr>\n", |
|
|
379 |
" <tr>\n", |
|
|
380 |
" <th>1</th>\n", |
|
|
381 |
" <td>1</td>\n", |
|
|
382 |
" <td>Abstract The Large Tumor Suppressor 1 (LATS1)...</td>\n", |
|
|
383 |
" </tr>\n", |
|
|
384 |
" <tr>\n", |
|
|
385 |
" <th>2</th>\n", |
|
|
386 |
" <td>2</td>\n", |
|
|
387 |
" <td>Vascular endothelial growth factor receptor (V...</td>\n", |
|
|
388 |
" </tr>\n", |
|
|
389 |
" <tr>\n", |
|
|
390 |
" <th>3</th>\n", |
|
|
391 |
" <td>3</td>\n", |
|
|
392 |
" <td>Inflammatory myofibroblastic tumor (IMT) is a ...</td>\n", |
|
|
393 |
" </tr>\n", |
|
|
394 |
" <tr>\n", |
|
|
395 |
" <th>4</th>\n", |
|
|
396 |
" <td>4</td>\n", |
|
|
397 |
" <td>Abstract Retinoblastoma is a pediatric retina...</td>\n", |
|
|
398 |
" </tr>\n", |
|
|
399 |
" </tbody>\n", |
|
|
400 |
"</table>\n", |
|
|
401 |
"</div>" |
|
|
402 |
], |
|
|
403 |
"text/plain": [ |
|
|
404 |
" ID Text\n", |
|
|
405 |
"0 0 2. This mutation resulted in a myeloproliferat...\n", |
|
|
406 |
"1 1 Abstract The Large Tumor Suppressor 1 (LATS1)...\n", |
|
|
407 |
"2 2 Vascular endothelial growth factor receptor (V...\n", |
|
|
408 |
"3 3 Inflammatory myofibroblastic tumor (IMT) is a ...\n", |
|
|
409 |
"4 4 Abstract Retinoblastoma is a pediatric retina..." |
|
|
410 |
] |
|
|
411 |
}, |
|
|
412 |
"execution_count": 7, |
|
|
413 |
"metadata": {}, |
|
|
414 |
"output_type": "execute_result" |
|
|
415 |
} |
|
|
416 |
], |
|
|
417 |
"source": [ |
|
|
418 |
"test_text = pd.read_csv(r\"G:\\Shared drives\\Hack(Her)thon 2023 Project\\data\\test_text\\test_text\", sep=\"\\|\\|\", names=[\"ID\", \"Text\"], skiprows=1, engine=\"python\", encoding=\"latin-1\")\n", |
|
|
419 |
"test_text.head()" |
|
|
420 |
] |
|
|
421 |
}, |
|
|
422 |
{ |
|
|
423 |
"cell_type": "code", |
|
|
424 |
"execution_count": 8, |
|
|
425 |
"metadata": {}, |
|
|
426 |
"outputs": [ |
|
|
427 |
{ |
|
|
428 |
"name": "stderr", |
|
|
429 |
"output_type": "stream", |
|
|
430 |
"text": [ |
|
|
431 |
"[nltk_data] Downloading package stopwords to\n", |
|
|
432 |
"[nltk_data] C:\\Users\\chawl\\AppData\\Roaming\\nltk_data...\n", |
|
|
433 |
"[nltk_data] Package stopwords is already up-to-date!\n" |
|
|
434 |
] |
|
|
435 |
} |
|
|
436 |
], |
|
|
437 |
"source": [ |
|
|
438 |
"#right now the training_text's 'Text' column needs cleaning so that it can be processed to look for data that indicates the cancerous class\n", |
|
|
439 |
"\n", |
|
|
440 |
"#loading stop words from nltk library \n", |
|
|
441 |
"nltk.download('stopwords')\n", |
|
|
442 |
"stop_words = set(stopwords.words('english'))\n", |
|
|
443 |
"\n", |
|
|
444 |
"def cleaning_text(full_text, index, column):\n", |
|
|
445 |
" if type(full_text) is not str:\n", |
|
|
446 |
" full_text = str(full_text)\n", |
|
|
447 |
" \n", |
|
|
448 |
" new_text = \"\"\n", |
|
|
449 |
" empty_string = ' '\n", |
|
|
450 |
" full_text = re.sub('[^a-zA-Z0-9\\n]', empty_string, full_text)\n", |
|
|
451 |
" full_text = re.sub('\\s+', empty_string, full_text)\n", |
|
|
452 |
" full_text = full_text.lower()\n", |
|
|
453 |
"\n", |
|
|
454 |
" for word in full_text.split():\n", |
|
|
455 |
" if not word in stop_words:\n", |
|
|
456 |
" new_text += word + \" \"\n", |
|
|
457 |
" \n", |
|
|
458 |
" train_model_text[column][index] = new_text" |
|
|
459 |
] |
|
|
460 |
}, |
|
|
461 |
{ |
|
|
462 |
"cell_type": "code", |
|
|
463 |
"execution_count": 9, |
|
|
464 |
"metadata": {}, |
|
|
465 |
"outputs": [ |
|
|
466 |
{ |
|
|
467 |
"name": "stderr", |
|
|
468 |
"output_type": "stream", |
|
|
469 |
"text": [ |
|
|
470 |
"C:\\Users\\chawl\\AppData\\Local\\Temp\\ipykernel_21704\\3679951068.py:21: SettingWithCopyWarning: \n", |
|
|
471 |
"A value is trying to be set on a copy of a slice from a DataFrame\n", |
|
|
472 |
"\n", |
|
|
473 |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", |
|
|
474 |
" train_model_text[column][index] = new_text\n" |
|
|
475 |
] |
|
|
476 |
}, |
|
|
477 |
{ |
|
|
478 |
"data": { |
|
|
479 |
"text/plain": [ |
|
|
480 |
"str" |
|
|
481 |
] |
|
|
482 |
}, |
|
|
483 |
"execution_count": 9, |
|
|
484 |
"metadata": {}, |
|
|
485 |
"output_type": "execute_result" |
|
|
486 |
} |
|
|
487 |
], |
|
|
488 |
"source": [ |
|
|
489 |
"#row 1109 in 'Text' created issues because it was null, so its initial type was float rather than str\n", |
|
|
490 |
"cleaning_text(train_model_text['Text'][1109], 1109, 'Text')\n", |
|
|
491 |
"type(train_model_text['Text'][1109])" |
|
|
492 |
] |
|
|
493 |
}, |
|
|
494 |
{ |
|
|
495 |
"cell_type": "code", |
|
|
496 |
"execution_count": 10, |
|
|
497 |
"metadata": {}, |
|
|
498 |
"outputs": [ |
|
|
499 |
{ |
|
|
500 |
"name": "stderr", |
|
|
501 |
"output_type": "stream", |
|
|
502 |
"text": [ |
|
|
503 |
"C:\\Users\\chawl\\AppData\\Local\\Temp\\ipykernel_21704\\3679951068.py:21: SettingWithCopyWarning: \n", |
|
|
504 |
"A value is trying to be set on a copy of a slice from a DataFrame\n", |
|
|
505 |
"\n", |
|
|
506 |
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", |
|
|
507 |
" train_model_text[column][index] = new_text\n" |
|
|
508 |
] |
|
|
509 |
} |
|
|
510 |
], |
|
|
511 |
"source": [ |
|
|
512 |
"#processing all the text in the training set\n", |
|
|
513 |
"\n", |
|
|
514 |
"for index, row in train_model_text.iterrows():\n", |
|
|
515 |
" cleaning_text(row['Text'], index, 'Text')" |
|
|
516 |
] |
|
|
517 |
}, |
|
|
518 |
{ |
|
|
519 |
"cell_type": "code", |
|
|
520 |
"execution_count": 11, |
|
|
521 |
"metadata": {}, |
|
|
522 |
"outputs": [ |
|
|
523 |
{ |
|
|
524 |
"data": { |
|
|
525 |
"text/html": [ |
|
|
526 |
"<div>\n", |
|
|
527 |
"<style scoped>\n", |
|
|
528 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
529 |
" vertical-align: middle;\n", |
|
|
530 |
" }\n", |
|
|
531 |
"\n", |
|
|
532 |
" .dataframe tbody tr th {\n", |
|
|
533 |
" vertical-align: top;\n", |
|
|
534 |
" }\n", |
|
|
535 |
"\n", |
|
|
536 |
" .dataframe thead th {\n", |
|
|
537 |
" text-align: right;\n", |
|
|
538 |
" }\n", |
|
|
539 |
"</style>\n", |
|
|
540 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
541 |
" <thead>\n", |
|
|
542 |
" <tr style=\"text-align: right;\">\n", |
|
|
543 |
" <th></th>\n", |
|
|
544 |
" <th>ID</th>\n", |
|
|
545 |
" <th>Gene</th>\n", |
|
|
546 |
" <th>Variation</th>\n", |
|
|
547 |
" <th>Class</th>\n", |
|
|
548 |
" <th>Text</th>\n", |
|
|
549 |
" </tr>\n", |
|
|
550 |
" </thead>\n", |
|
|
551 |
" <tbody>\n", |
|
|
552 |
" <tr>\n", |
|
|
553 |
" <th>0</th>\n", |
|
|
554 |
" <td>0</td>\n", |
|
|
555 |
" <td>FAM58A</td>\n", |
|
|
556 |
" <td>Truncating Mutations</td>\n", |
|
|
557 |
" <td>1</td>\n", |
|
|
558 |
" <td>cyclin dependent kinases cdks regulate variety...</td>\n", |
|
|
559 |
" </tr>\n", |
|
|
560 |
" <tr>\n", |
|
|
561 |
" <th>1</th>\n", |
|
|
562 |
" <td>1</td>\n", |
|
|
563 |
" <td>CBL</td>\n", |
|
|
564 |
" <td>W802*</td>\n", |
|
|
565 |
" <td>2</td>\n", |
|
|
566 |
" <td>abstract background non small cell lung cancer...</td>\n", |
|
|
567 |
" </tr>\n", |
|
|
568 |
" <tr>\n", |
|
|
569 |
" <th>2</th>\n", |
|
|
570 |
" <td>2</td>\n", |
|
|
571 |
" <td>CBL</td>\n", |
|
|
572 |
" <td>Q249E</td>\n", |
|
|
573 |
" <td>2</td>\n", |
|
|
574 |
" <td>abstract background non small cell lung cancer...</td>\n", |
|
|
575 |
" </tr>\n", |
|
|
576 |
" <tr>\n", |
|
|
577 |
" <th>3</th>\n", |
|
|
578 |
" <td>3</td>\n", |
|
|
579 |
" <td>CBL</td>\n", |
|
|
580 |
" <td>N454D</td>\n", |
|
|
581 |
" <td>3</td>\n", |
|
|
582 |
" <td>recent evidence demonstrated acquired uniparen...</td>\n", |
|
|
583 |
" </tr>\n", |
|
|
584 |
" <tr>\n", |
|
|
585 |
" <th>4</th>\n", |
|
|
586 |
" <td>4</td>\n", |
|
|
587 |
" <td>CBL</td>\n", |
|
|
588 |
" <td>L399V</td>\n", |
|
|
589 |
" <td>4</td>\n", |
|
|
590 |
" <td>oncogenic mutations monomeric casitas b lineag...</td>\n", |
|
|
591 |
" </tr>\n", |
|
|
592 |
" </tbody>\n", |
|
|
593 |
"</table>\n", |
|
|
594 |
"</div>" |
|
|
595 |
], |
|
|
596 |
"text/plain": [ |
|
|
597 |
" ID Gene Variation Class \\\n", |
|
|
598 |
"0 0 FAM58A Truncating Mutations 1 \n", |
|
|
599 |
"1 1 CBL W802* 2 \n", |
|
|
600 |
"2 2 CBL Q249E 2 \n", |
|
|
601 |
"3 3 CBL N454D 3 \n", |
|
|
602 |
"4 4 CBL L399V 4 \n", |
|
|
603 |
"\n", |
|
|
604 |
" Text \n", |
|
|
605 |
"0 cyclin dependent kinases cdks regulate variety... \n", |
|
|
606 |
"1 abstract background non small cell lung cancer... \n", |
|
|
607 |
"2 abstract background non small cell lung cancer... \n", |
|
|
608 |
"3 recent evidence demonstrated acquired uniparen... \n", |
|
|
609 |
"4 oncogenic mutations monomeric casitas b lineag... " |
|
|
610 |
] |
|
|
611 |
}, |
|
|
612 |
"execution_count": 11, |
|
|
613 |
"metadata": {}, |
|
|
614 |
"output_type": "execute_result" |
|
|
615 |
} |
|
|
616 |
], |
|
|
617 |
"source": [ |
|
|
618 |
"#Merging Training Text and Training Variants\n", |
|
|
619 |
"training_dataframe = pd.merge(train_model_variants, train_model_text, on = \"ID\")\n", |
|
|
620 |
"training_dataframe.head()\n" |
|
|
621 |
] |
|
|
622 |
}, |
|
|
623 |
{ |
|
|
624 |
"cell_type": "code", |
|
|
625 |
"execution_count": 12, |
|
|
626 |
"metadata": {}, |
|
|
627 |
"outputs": [ |
|
|
628 |
{ |
|
|
629 |
"data": { |
|
|
630 |
"text/html": [ |
|
|
631 |
"<div>\n", |
|
|
632 |
"<style scoped>\n", |
|
|
633 |
" .dataframe tbody tr th:only-of-type {\n", |
|
|
634 |
" vertical-align: middle;\n", |
|
|
635 |
" }\n", |
|
|
636 |
"\n", |
|
|
637 |
" .dataframe tbody tr th {\n", |
|
|
638 |
" vertical-align: top;\n", |
|
|
639 |
" }\n", |
|
|
640 |
"\n", |
|
|
641 |
" .dataframe thead th {\n", |
|
|
642 |
" text-align: right;\n", |
|
|
643 |
" }\n", |
|
|
644 |
"</style>\n", |
|
|
645 |
"<table border=\"1\" class=\"dataframe\">\n", |
|
|
646 |
" <thead>\n", |
|
|
647 |
" <tr style=\"text-align: right;\">\n", |
|
|
648 |
" <th></th>\n", |
|
|
649 |
" <th>ID</th>\n", |
|
|
650 |
" <th>Gene</th>\n", |
|
|
651 |
" <th>Variation</th>\n", |
|
|
652 |
" <th>Text</th>\n", |
|
|
653 |
" </tr>\n", |
|
|
654 |
" </thead>\n", |
|
|
655 |
" <tbody>\n", |
|
|
656 |
" <tr>\n", |
|
|
657 |
" <th>0</th>\n", |
|
|
658 |
" <td>0</td>\n", |
|
|
659 |
" <td>ACSL4</td>\n", |
|
|
660 |
" <td>R570S</td>\n", |
|
|
661 |
" <td>2. This mutation resulted in a myeloproliferat...</td>\n", |
|
|
662 |
" </tr>\n", |
|
|
663 |
" <tr>\n", |
|
|
664 |
" <th>1</th>\n", |
|
|
665 |
" <td>1</td>\n", |
|
|
666 |
" <td>NAGLU</td>\n", |
|
|
667 |
" <td>P521L</td>\n", |
|
|
668 |
" <td>Abstract The Large Tumor Suppressor 1 (LATS1)...</td>\n", |
|
|
669 |
" </tr>\n", |
|
|
670 |
" <tr>\n", |
|
|
671 |
" <th>2</th>\n", |
|
|
672 |
" <td>2</td>\n", |
|
|
673 |
" <td>PAH</td>\n", |
|
|
674 |
" <td>L333F</td>\n", |
|
|
675 |
" <td>Vascular endothelial growth factor receptor (V...</td>\n", |
|
|
676 |
" </tr>\n", |
|
|
677 |
" <tr>\n", |
|
|
678 |
" <th>3</th>\n", |
|
|
679 |
" <td>3</td>\n", |
|
|
680 |
" <td>ING1</td>\n", |
|
|
681 |
" <td>A148D</td>\n", |
|
|
682 |
" <td>Inflammatory myofibroblastic tumor (IMT) is a ...</td>\n", |
|
|
683 |
" </tr>\n", |
|
|
684 |
" <tr>\n", |
|
|
685 |
" <th>4</th>\n", |
|
|
686 |
" <td>4</td>\n", |
|
|
687 |
" <td>TMEM216</td>\n", |
|
|
688 |
" <td>G77A</td>\n", |
|
|
689 |
" <td>Abstract Retinoblastoma is a pediatric retina...</td>\n", |
|
|
690 |
" </tr>\n", |
|
|
691 |
" </tbody>\n", |
|
|
692 |
"</table>\n", |
|
|
693 |
"</div>" |
|
|
694 |
], |
|
|
695 |
"text/plain": [ |
|
|
696 |
" ID Gene Variation Text\n", |
|
|
697 |
"0 0 ACSL4 R570S 2. This mutation resulted in a myeloproliferat...\n", |
|
|
698 |
"1 1 NAGLU P521L Abstract The Large Tumor Suppressor 1 (LATS1)...\n", |
|
|
699 |
"2 2 PAH L333F Vascular endothelial growth factor receptor (V...\n", |
|
|
700 |
"3 3 ING1 A148D Inflammatory myofibroblastic tumor (IMT) is a ...\n", |
|
|
701 |
"4 4 TMEM216 G77A Abstract Retinoblastoma is a pediatric retina..." |
|
|
702 |
] |
|
|
703 |
}, |
|
|
704 |
"execution_count": 12, |
|
|
705 |
"metadata": {}, |
|
|
706 |
"output_type": "execute_result" |
|
|
707 |
} |
|
|
708 |
], |
|
|
709 |
"source": [ |
|
|
710 |
"test_dataframe = pd.merge(test_variants, test_text, on = \"ID\")\n", |
|
|
711 |
"test_dataframe.head()" |
|
|
712 |
] |
|
|
713 |
}, |
|
|
714 |
{ |
|
|
715 |
"cell_type": "code", |
|
|
716 |
"execution_count": 13, |
|
|
717 |
"metadata": {}, |
|
|
718 |
"outputs": [ |
|
|
719 |
{ |
|
|
720 |
"name": "stderr", |
|
|
721 |
"output_type": "stream", |
|
|
722 |
"text": [ |
|
|
723 |
"C:\\Users\\chawl\\AppData\\Local\\Temp\\ipykernel_21704\\736478266.py:2: FutureWarning: The default value of regex will change from True to False in a future version.\n", |
|
|
724 |
" training_dataframe.Variation= training_dataframe.Variation.str.replace('\\s+', '_')\n" |
|
|
725 |
] |
|
|
726 |
} |
|
|
727 |
], |
|
|
728 |
"source": [ |
|
|
729 |
"#fixing the variation column \n", |
|
|
730 |
"training_dataframe.Variation= training_dataframe.Variation.str.replace('\\s+', '_')" |
|
|
731 |
] |
|
|
732 |
}, |
|
|
733 |
{ |
|
|
734 |
"attachments": {}, |
|
|
735 |
"cell_type": "markdown", |
|
|
736 |
"metadata": {}, |
|
|
737 |
"source": [ |
|
|
738 |
"Step 2: Splitting the dataset for Training, Validation and Testing" |
|
|
739 |
] |
|
|
740 |
}, |
|
|
741 |
{ |
|
|
742 |
"cell_type": "code", |
|
|
743 |
"execution_count": 14, |
|
|
744 |
"metadata": {}, |
|
|
745 |
"outputs": [], |
|
|
746 |
"source": [ |
|
|
747 |
"from sklearn.model_selection import train_test_split" |
|
|
748 |
] |
|
|
749 |
}, |
|
|
750 |
{ |
|
|
751 |
"cell_type": "code", |
|
|
752 |
"execution_count": 15, |
|
|
753 |
"metadata": {}, |
|
|
754 |
"outputs": [], |
|
|
755 |
"source": [ |
|
|
756 |
"#split the class column from the dataframe since we are trying to predict it\n", |
|
|
757 |
"X = training_dataframe.copy()\n", |
|
|
758 |
"y = training_dataframe['Class'].values\n", |
|
|
759 |
"\n", |
|
|
760 |
"X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size = 0.8)\n", |
|
|
761 |
"\n", |
|
|
762 |
"X_vaildation, X_test, y_validation, y_test = train_test_split(X_rem, y_rem, test_size=0.5)" |
|
|
763 |
] |
|
|
764 |
}, |
|
|
765 |
{ |
|
|
766 |
"cell_type": "code", |
|
|
767 |
"execution_count": 16, |
|
|
768 |
"metadata": {}, |
|
|
769 |
"outputs": [ |
|
|
770 |
{ |
|
|
771 |
"name": "stdout", |
|
|
772 |
"output_type": "stream", |
|
|
773 |
"text": [ |
|
|
774 |
"Number of records in training set: (2656, 5)\n", |
|
|
775 |
"Number of records in Cross validation set: (332, 5)\n", |
|
|
776 |
"Number of records in testing set: (333, 5)\n" |
|
|
777 |
] |
|
|
778 |
} |
|
|
779 |
], |
|
|
780 |
"source": [ |
|
|
781 |
"print(\"Number of records in training set:\", X_train.shape)\n", |
|
|
782 |
"print(\"Number of records in Cross validation set:\", X_vaildation.shape)\n", |
|
|
783 |
"print(\"Number of records in testing set:\", X_test.shape)\n" |
|
|
784 |
] |
|
|
785 |
}, |
|
|
786 |
{ |
|
|
787 |
"cell_type": "code", |
|
|
788 |
"execution_count": 17, |
|
|
789 |
"metadata": {}, |
|
|
790 |
"outputs": [ |
|
|
791 |
{ |
|
|
792 |
"name": "stdout", |
|
|
793 |
"output_type": "stream", |
|
|
794 |
"text": [ |
|
|
795 |
"Counter({7: 768, 4: 549, 1: 443, 2: 361, 6: 219, 5: 196, 3: 77, 9: 27, 8: 16})\n" |
|
|
796 |
] |
|
|
797 |
} |
|
|
798 |
], |
|
|
799 |
"source": [ |
|
|
800 |
"import collections\n", |
|
|
801 |
"from collections import Counter\n", |
|
|
802 |
"z_train = y_train.tolist()\n", |
|
|
803 |
"d_train = Counter(z_train)\n", |
|
|
804 |
"print(d_train)" |
|
|
805 |
] |
|
|
806 |
}, |
|
|
807 |
{ |
|
|
808 |
"cell_type": "code", |
|
|
809 |
"execution_count": 18, |
|
|
810 |
"metadata": {}, |
|
|
811 |
"outputs": [ |
|
|
812 |
{ |
|
|
813 |
"name": "stdout", |
|
|
814 |
"output_type": "stream", |
|
|
815 |
"text": [ |
|
|
816 |
"Counter({7: 28.91566265060241, 4: 20.670180722891565, 1: 16.67921686746988, 2: 13.591867469879517, 6: 8.245481927710843, 5: 7.379518072289157, 3: 2.8990963855421685, 9: 1.016566265060241, 8: 0.6024096385542169})\n", |
|
|
817 |
"<class 'collections.Counter'>\n", |
|
|
818 |
"Counter({7: 29.518072289156628, 1: 20.783132530120483, 4: 18.97590361445783, 2: 13.55421686746988, 5: 7.530120481927711, 6: 6.927710843373494, 3: 1.8072289156626506, 9: 0.6024096385542169, 8: 0.30120481927710846})\n", |
|
|
819 |
"Counter({7: 26.126126126126128, 4: 22.22222222222222, 1: 16.816816816816818, 2: 13.813813813813814, 6: 9.90990990990991, 5: 6.306306306306307, 9: 2.4024024024024024, 3: 1.8018018018018018, 8: 0.6006006006006006})\n" |
|
|
820 |
] |
|
|
821 |
} |
|
|
822 |
], |
|
|
823 |
"source": [ |
|
|
824 |
"\n", |
|
|
825 |
"s = sum(d_train.values())\n", |
|
|
826 |
"for k,v in d_train.items():\n", |
|
|
827 |
" pct = v* 100.0/s\n", |
|
|
828 |
" d_train[k] = pct\n", |
|
|
829 |
"print(d_train)\n", |
|
|
830 |
"print(type(d_train))\n", |
|
|
831 |
"#print(d.values())\n", |
|
|
832 |
"\n", |
|
|
833 |
"z_cv = y_validation.tolist()\n", |
|
|
834 |
"d_cv = Counter(z_cv)\n", |
|
|
835 |
"s2 = sum(d_cv.values())\n", |
|
|
836 |
"for k,v in d_cv.items():\n", |
|
|
837 |
" pct = v* 100.0/s2\n", |
|
|
838 |
" d_cv[k] = pct\n", |
|
|
839 |
"print(d_cv)\n", |
|
|
840 |
"\n", |
|
|
841 |
"\n", |
|
|
842 |
"z_testing = y_test.tolist()\n", |
|
|
843 |
"d_testing = Counter(z_testing)\n", |
|
|
844 |
"\n", |
|
|
845 |
"s3 = sum(d_testing.values())\n", |
|
|
846 |
"for k,v in d_testing.items():\n", |
|
|
847 |
" pct = v* 100.0/s3\n", |
|
|
848 |
" d_testing[k] = pct\n", |
|
|
849 |
"print(d_testing)" |
|
|
850 |
] |
|
|
851 |
}, |
|
|
852 |
{ |
|
|
853 |
"cell_type": "code", |
|
|
854 |
"execution_count": 19, |
|
|
855 |
"metadata": {}, |
|
|
856 |
"outputs": [ |
|
|
857 |
{ |
|
|
858 |
"data": { |
|
|
859 |
"image/png": "", |
|
|
860 |
"text/plain": [ |
|
|
861 |
"<Figure size 300x300 with 1 Axes>" |
|
|
862 |
] |
|
|
863 |
}, |
|
|
864 |
"metadata": {}, |
|
|
865 |
"output_type": "display_data" |
|
|
866 |
}, |
|
|
867 |
{ |
|
|
868 |
"data": { |
|
|
869 |
"image/png": "", |
|
|
870 |
"text/plain": [ |
|
|
871 |
"<Figure size 300x300 with 1 Axes>" |
|
|
872 |
] |
|
|
873 |
}, |
|
|
874 |
"metadata": {}, |
|
|
875 |
"output_type": "display_data" |
|
|
876 |
}, |
|
|
877 |
{ |
|
|
878 |
"data": { |
|
|
879 |
"image/png": "", |
|
|
880 |
"text/plain": [ |
|
|
881 |
"<Figure size 300x300 with 1 Axes>" |
|
|
882 |
] |
|
|
883 |
}, |
|
|
884 |
"metadata": {}, |
|
|
885 |
"output_type": "display_data" |
|
|
886 |
} |
|
|
887 |
], |
|
|
888 |
"source": [ |
|
|
889 |
"#Distribution of classes across all 3 datasets (training, cross-validation and testing) should be fair \n", |
|
|
890 |
"\n", |
|
|
891 |
"#1. training\n", |
|
|
892 |
"fig_train = plt.figure(figsize=(3,3))\n", |
|
|
893 |
"plt.bar(d_train.keys(), d_train.values(), color = 'pink', width = 0.5)\n", |
|
|
894 |
"plt.xlabel(\"Classes\")\n", |
|
|
895 |
"plt.ylabel(\"Total cases\")\n", |
|
|
896 |
"plt.title(\"Distribution of classes in the Training DataSet\")\n", |
|
|
897 |
"plt.show()\n", |
|
|
898 |
"\n", |
|
|
899 |
"#2. cross-validation\n", |
|
|
900 |
"fig_cv = plt.figure(figsize=(3,3))\n", |
|
|
901 |
"plt.bar(d_cv.keys(), d_cv.values(), color = 'pink', width = 0.5)\n", |
|
|
902 |
"plt.xlabel(\"Classes\")\n", |
|
|
903 |
"plt.ylabel(\"Total cases\")\n", |
|
|
904 |
"plt.title(\"Distribution of classes in the Cross-Validation DataSet\")\n", |
|
|
905 |
"plt.show()\n", |
|
|
906 |
"\n", |
|
|
907 |
"#3. testing\n", |
|
|
908 |
"fig = plt.figure(figsize=(3,3))\n", |
|
|
909 |
"plt.bar(d_testing.keys(), d_testing.values(), color = 'pink', width = 0.5)\n", |
|
|
910 |
"plt.xlabel(\"Classes\")\n", |
|
|
911 |
"plt.ylabel(\"Total cases\")\n", |
|
|
912 |
"plt.title(\"Distribution of classes in the Testing DataSet\")\n", |
|
|
913 |
"plt.show()\n" |
|
|
914 |
] |
|
|
915 |
}, |
|
|
916 |
{ |
|
|
917 |
"attachments": {}, |
|
|
918 |
"cell_type": "markdown", |
|
|
919 |
"metadata": {}, |
|
|
920 |
"source": [ |
|
|
921 |
"Setting performance Standards for the model: Log Loss\n", |
|
|
922 |
"\n", |
|
|
923 |
"For a multi-classification problem where each class has an equal probability of being classified, \n", |
|
|
924 |
"log_loss = -log(1/M) where M is the number of classes\n", |
|
|
925 |
"-log(1/9) = 0.954\n", |
|
|
926 |
"\n", |
|
|
927 |
"But that is not the case here. As we saw from the graphs above, all 3 dataset are unequally distributed over classes. \n", |
|
|
928 |
"So, how do we calculate log-loss for imbalanced multi-class problem.\n", |
|
|
929 |
"\n", |
|
|
930 |
"By my calculation, the random guessing log-loss for this imbalanced multi class(K=9) problem comes out as 1.829\n", |
|
|
931 |
"Pictures of calculation on github.\n", |
|
|
932 |
"\n", |
|
|
933 |
"Therefore, our model has to do better than this i.e. logloss<1.829" |
|
|
934 |
] |
|
|
935 |
}, |
|
|
936 |
{ |
|
|
937 |
"attachments": {}, |
|
|
938 |
"cell_type": "markdown", |
|
|
939 |
"metadata": {}, |
|
|
940 |
"source": [ |
|
|
941 |
"### Univariate Analysis" |
|
|
942 |
] |
|
|
943 |
}, |
|
|
944 |
{ |
|
|
945 |
"attachments": {}, |
|
|
946 |
"cell_type": "markdown", |
|
|
947 |
"metadata": {}, |
|
|
948 |
"source": [ |
|
|
949 |
"##### Gene Feature" |
|
|
950 |
] |
|
|
951 |
}, |
|
|
952 |
{ |
|
|
953 |
"cell_type": "code", |
|
|
954 |
"execution_count": 20, |
|
|
955 |
"metadata": {}, |
|
|
956 |
"outputs": [ |
|
|
957 |
{ |
|
|
958 |
"name": "stdout", |
|
|
959 |
"output_type": "stream", |
|
|
960 |
"text": [ |
|
|
961 |
"There are 244 different categories of genes in the training data\n", |
|
|
962 |
"Gene\n", |
|
|
963 |
"BRCA1 220\n", |
|
|
964 |
"TP53 117\n", |
|
|
965 |
"EGFR 113\n", |
|
|
966 |
"PTEN 106\n", |
|
|
967 |
"BRCA2 89\n", |
|
|
968 |
" ... \n", |
|
|
969 |
"LATS1 1\n", |
|
|
970 |
"KLF4 1\n", |
|
|
971 |
"BCL2L11 1\n", |
|
|
972 |
"KDM6A 1\n", |
|
|
973 |
"FLT1 1\n", |
|
|
974 |
"Length: 244, dtype: int64\n" |
|
|
975 |
] |
|
|
976 |
} |
|
|
977 |
], |
|
|
978 |
"source": [ |
|
|
979 |
"#unique, number of occuerences of each gene, cumulative distribution \n", |
|
|
980 |
"unique_genes =X_train.value_counts('Gene') \n", |
|
|
981 |
"print(\"There are\", unique_genes.shape[0], \"different categories of genes in the training data\" )\n", |
|
|
982 |
"print(unique_genes)" |
|
|
983 |
] |
|
|
984 |
}, |
|
|
985 |
{ |
|
|
986 |
"cell_type": "code", |
|
|
987 |
"execution_count": 21, |
|
|
988 |
"metadata": {}, |
|
|
989 |
"outputs": [ |
|
|
990 |
{ |
|
|
991 |
"data": { |
|
|
992 |
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAkQAAAGwCAYAAABIC3rIAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAA9hAAAPYQGoP6dpAABVY0lEQVR4nO3deVxU5f4H8M+ZYdjBDWRRQHBFcYUsUDRLMc00q5upN/O63Eu4Qpaau97U0ojMrQwzu17ld1NbDBXUJBXLDXPDDVFcIARFNoVh5vz+QCanAWVw5sw483m/Xr6uc+aZM995mIufnuc5zxFEURRBREREZMVkpi6AiIiIyNQYiIiIiMjqMRARERGR1WMgIiIiIqvHQERERERWj4GIiIiIrB4DEREREVk9G1MXYI7UajVu3LgBFxcXCIJg6nKIiIioFkRRRFFREby9vSGT6Tfmw0BUjRs3bsDHx8fUZRAREVEdXL16FU2bNtXrNQxE1XBxcQFQ2aGurq4GPbdSqURSUhIiIiKgUCgMem6qHvtcWuxv6bHPpcc+l15t+rywsBA+Pj6af8f1wUBUjappMldXV6MEIkdHR7i6uvL/RBJhn0uL/S099rn02OfS06fP67LchYuqiYiIyOoxEBEREZHVYyAiIiIiq8c1REREFkSlUkGpVJq6DIunVCphY2ODe/fuQaVSmbocq1DV52q12ijnZyAiIrIQf/zxB4qKikxdhlUQRRGenp64evUq96uTSFWfZ2ZmIiAgALa2tgY9PwMREZEFcHFxQWFhITw8PODo6Mh/pI1MrVajuLgYzs7Oem8ASHWjVqtRVFSEwsJCZGdnw9fX16DfcwYiIqInnEqlgouLC9zd3dGoUSNTl2MV1Go1ysvLYW9vz0Akkao+d3d3R3Z2NioqKgy65QF/ikRET7iKigrIZDI4OjqauhQio6sKQYZeu8VARET0hBNFEUDdNqMjetIY63vOQERERERWj4GIiIiIrB4DERERmb1169ahfv36pi7jiSCKIv75z3+iYcOGEAQBx48fN3VJTwQGIgmVVahwo+AubpeZuhIiIvMwcuRIvPzyyzrH9+7dC0EQUFBQAAAYMmQIzp8/X6tzWnt42rFjB9atW4dt27YhOzsbQUFB1bYTRRFr1qxBaGgoXF1d4ezsjHbt2mHSpEm4ePGixFWbHgORhE5dv4OeH+/D8tNyU5dCRPREcXBwQOPGjU1dhg5z3BU8IyMDXl5eCAsLg6enJ2xsdHfYEUURw4YNw8SJE9G/f38kJSXhxIkTWLZsGRwcHPDvf//bBJWbFgORhGT3V8YbZ9NxIqI/iaKI0vIKk/ypuurNkP466vP777+jV69ecHFxgaurK4KDg3HkyBHs3bsX//jHP3Dnzh0IggBBEDB37lwAwO3btzFixAg0aNAAjo6O6NevHy5cuKD1PmvWrIGPjw8cHR0xePBgxMbGar3v3Llz0alTJ6xduxadOnWCg4MDRFHEjh070L17d9SvXx+NGjXCgAEDkJGRoXnd5cuXIQgC/u///g/h4eFwcHDAU089hfPnz+Pw4cMICQmBs7MzXnjhBdy8efOhfZGSkoKuXbvCzs4OXl5emDZtGioqKgBUjrhNmDABWVlZEAQBzZo1q/YcCQkJ2LRpExISEjBr1iw888wzCAgIwPPPP4/Fixfjq6++0mr/1VdfITAwEPb29mjTpg1Wrlyp89m2bNmCXr16wdHRER07dsTBgwe1zpGamooePXrAwcEBPj4+mDhxIkpKSjTPr1y5Ei1btoS9vT08PDzw2muvPbQfDI0bM0pILrsfiAz/u4KISMtdpQptZ+80yXufmd8XjrbG/edl+PDh6Ny5M1atWgW5XI7jx49DoVAgLCwMcXFxmD17Ns6dOwcAcHZ2BlAZFi5cuIAffvgBrq6umDp1Kvr3748zZ85AoVDgwIEDiIyMxIcffoiBAwdi165dmDVrls57X7x4Ef/73/+wfv
16uLq6AgBKSkoQExOD9u3bo6SkBLNnz8bgwYNx/PhxrY0b58yZg7i4OPj6+mLUqFEYOnQoXF1d8emnn8LR0RGvv/46Zs+ejVWrVlX7ua9fv47+/ftj5MiRWL9+Pc6ePYuxY8fC3t4ec+fOxaefformzZvjiy++wOHDhyGXVz8jsXHjRrRu3RoDBw6s9vkHL21fs2YN5syZg+XLl6Nz585IS0vD2LFj4eTkhLfeekvTbsaMGVi6dClatmyJGTNmYOjQobh48SJsbGxw8uRJ9O3bFwsWLEB8fDxu3ryJ8ePHY/z48fjqq69w5MgRTJw4Ed988w3CwsJw69Yt7Nu372FfAYNjIJJQ1QiREf7jiYjoibVt2zZNaKnyqE33srKy8O6776JNmzYAgJYtW2qeq1evHgRBgKenp+ZYVRA6cOAAwsLCAAAbNmyAj48PvvvuO/ztb3/DZ599hn79+mHKlCkAgFatWiE1NRXbtm3Teu/y8nKsX78ednZ2cHV1hSAIePXVV7XaxMfHo3Hjxjhz5ozWGp4pU6agb9++AIBJkyZh6NCh2L17N7p16wYAGD16NNatW1fj5165ciV8fHywfPlyCIKANm3a4MaNG5g6dSpmz56NevXqwcXFBXK5XOvz/9X58+fRunVrrWOTJ0/Gl19+CQCoX78+rl27BgBYsGABPv74Y7zyyisAAH9/f5w5cwaff/65ViCaMmUKXnzxRQDAvHnz0K5dO1y8eBFt2rTBkiVLMGzYMEyePBlA5c9r2bJl6NmzJ1atWoWsrCw4OTlhwIABcHFxgZ+fHzp37lxj/cbAQCQhzQiRiesgIsvnoJDjzPy+JntvffTq1UtnROS3337D3//+9xpfExMTgzFjxuCbb75B79698be//Q3NmzevsX16ejpsbGzw9NNPa441atQIrVu3Rnp6OgDg3LlzGDx4sNbrunbtqhOI/Pz84O7ujsLCQs2xjIwMzJo1C7/++ivy8vI0d2TPysrSCkQdOnTQ/N3DwwMA0L59e61jubm5D/0coaGhWiM43bp1Q3FxMa5duwZfX98aX/tXf93gcMaMGRg/fjy2bNmChQsXAgBu3ryJq1evYvTo0Rg7dqymbUVFBerVq6f1+gc/m5eXFwAgNzcXbdq0wdGjR3Hx4kVs2LBB00YURajVamRmZqJPnz7w8/NDQEAAXnjhBbzwwgsYPHiwpLuvMxBJiFNmRCQVQRCMPm1lKE5OTmjRooXWsarRiZrMnTsXw4YNw08//YTt27djzpw52LRpk06gqVLTuiZRFDXB4MG/P+x1Tk5OOsdeeukl+Pj4YM2aNfD29oZarUZQUBDKy8u12j14762q9/rrsaow9ah6/1qjPjs4t2zZEmfPntU65u7uDnd3d63F61W1rFmzRitMAtCZjqvus1W9Xq1W41//+hcmTpyoU4uvry9sbW1x7Ngx7N27F0lJSZg9ezbmzp2Lw4cPS3bFIBdVS4hTZkREhtOqVStER0cjKSkJr7zyimYhsK2trc6UW9u2bVFRUYHffvtNcyw/Px/nz59HYGAgAKBNmzY4dOiQ1uuOHDnyyDry8/ORnp6OmTNn4vnnn0dgYCBu3779uB+vWm3btkVqaqpWUEtNTYWLiwuaNGlS6/MMHToU586dw/fff//Qdh4eHmjSpAkuXbqEFi1aaP3x9/ev9ft16dIFp0+f1jlHixYtYGtrCwCwsbFB79698dFHH+HEiRO4fPky9uzZU+v3eFxPxn8+WAhOmRERPb67d+/i3XffxWuvvQZ/f39cu3YNhw8f1qzjadasGYqLi7F792507NgRjo6OaNmyJQYNGoSxY8fi888/h4uLC6ZNm4YmTZpg0KBBAIAJEyagR48eiI2NxUsvvYQ9e/Zg+/btjxx5adCgARo1aoQvvvgCXl5eyMrKwrRp04zy2aOiohAXF4cJEyZg/PjxOHfuHObMmYOYmBitxduP8sYbb2DLli144403MH36dPTt2xceHh64cu
UKEhIStEZ/5s6di4kTJ8LV1RX9+vVDWVkZjhw5gtu3byMmJqZW7zd16lQ888wzGDdunGZBdnp6OpKTk/HZZ59h27ZtuHTpEnr06IEGDRogMTERarVaZ52TMXGESEJygVNmRESPSy6XIz8/HyNGjECrVq3w+uuvo1+/fpg3bx4AICwsDJGRkRgyZAjc3d3x0UcfAai8dDw4OBgDBgxAaGgoRFFEYmKiZqqnW7duWL16NWJjY9GxY0fs2LED0dHRsLe3f2g9MpkMmzZtwtGjRxEUFITo6GgsWbLEKJ+9SZMmSExMxKFDh9CxY0dERkZi9OjRmDlzpl7nEQQBCQkJiIuLQ2JiIp5//nm0bt0ao0aNgo+PD/bv369pO2bMGHz55ZdYt24d2rdvj549e2LdunV6jRB16NABKSkpuHDhAsLDw9G5c2fMmjVLs9aofv362LJlC5577jkEBgZi9erV2LhxI9q1a6fX53osoomtWLFCbNasmWhnZyd26dJF/OWXXx7afu/evWKXLl1EOzs70d/fX1y1apVOm08++URs1aqVaG9vLzZt2lScPHmyePfu3VrXdOfOHRGAeOfOHb0/z8NcvVUi+k3dJraY9qNYXl5u0HNTzcrLy8XvvvuOfS4R9rf0CgsLxSNHjoglJSWmLsXijBkzRuzevbvOcZVKJd6+fVtUqVQmqMo6VfV5SUmJeObMmWr/XX+cf79NOkKUkJCAyZMnY8aMGUhLS0N4eDj69euHrKysattnZmaif//+CA8PR1paGt5//31MnDgRmzdv1rTZsGEDpk2bhjlz5iA9PR3x8fFISEjA9OnTpfpYNeKUGRGReVu6dCl+//13XLx4EZ999hm+/vprrUvLyXKZdA1RbGwsRo8ejTFjxgAA4uLisHPnTqxatQqLFi3Sab969Wr4+voiLi4OABAYGIgjR45g6dKlmrnjgwcPolu3bhg2bBiAyrnkoUOH6iyUe1BZWRnKyv68wVjVpZRKpdKg27Kr7y/yU4vmud27parqa/a5NNjf0qvapVi8fxkz1d1vv/2Gjz76CEVFRQgICEBcXBxGjRql06/i/UXN7HPpPNjnoihCqVTqXOn2OL93TBaIysvLcfToUZ2FZxEREUhNTa32NQcPHkRERITWsb59+yI+Ph5KpRIKhQLdu3fHf/7zHxw6dAhdu3bFpUuXkJiY+NCEv2jRIs3c84OSkpIMugdCkRIAbCBCQFJSMvS4QpIMIDk52dQlWBX2t3RsbGzg6emJkpISBtHHtGbNGp1jD+439FdFRUXGLIeqUVJSgrt37+KXX37R/MdAldLS0jqf12SBKC8vDyqVSrMxVRUPDw/k5ORU+5qcnJxq21dUVCAvLw9eXl544403cPPmTXTv3h2iKKKiogJvv/32Q1f8T58+XWulfGFhIXx8fBAREaHZlt0QbpeWY+aRvQCA53v3hr2drcHOTTVTKpVITk5Gnz59tPbJIONgf0uvuLgYly5dgqOjo6Qb2VkzURRRVFQEFxcXvfb/obqr6nNHR0c4ODigZ8+esLOz02rzsPD6KCa/7L66DaYe9uV61IZUe/fuxQcffICVK1fi6aefxsWLFzFp0iR4eXlVe08aALCzs9PpVKBykylD/kJ/MP/I5Db8x0Jihv550sOxv6Xj4OAAtVqNu3fv6twCg4yjappMEAS9Lnenuqvq84qKCgiCADs7O53fMY/zO8dkgcjNzQ1yuVxnNCg3N1dnFKiKp6dnte1tbGzQqFEjAMCsWbPw5ptvatYlVd1o75///CdmzJhh0i9u1aJqAFBzd0YiMhC5XI6ioiLcvHkTMpkMjo6OHLUwMrVajfLycty7d4+BSCJqtRplZWUoLCyEo6MjbGwMG2FMFohsbW0RHByM5ORkra3Wk5OTNZtk/VVoaCh+/PFHrWNJSUkICQnRpMLS0lKdL6dcLtcswjIl+QO/oFTcjIiIDKioqAitWrV66H2wyHBEUcTdu3fh4ODA8CmRqj
53cnKCl5eXwfvdpFNmMTExePPNNxESEoLQ0FB88cUXyMrKQmRkJIDKtT3Xr1/H+vXrAQCRkZFYvnw5YmJiMHbsWBw8eBDx8fHYuHGj5pwvvfQSYmNj0blzZ82U2axZszBw4ECd1ehSezCncYSIiAzNw8MDXl5eXFgtAaVSiV9++QU9evTg1LBElEolUlJS0KdPH83tPgzJpIFoyJAhyM/Px/z585GdnY2goCAkJibCz88PAJCdna21J5G/vz8SExMRHR2NFStWwNvbG8uWLdNccg8AM2fOhCAImDlzJq5fvw53d3e89NJL+OCDDyT/fH+lPUJkwkKIyGLJ5XKT/8efNZDL5aioqIC9vT0DkUTkcjlUKpXRpihNvqg6KioKUVFR1T63bt06nWM9e/bEsWPHajyfjY0N5syZgzlz5hiqRIN5cA2RiiNEREREZoMrwSQkCIJm7yE11xARERGZDQYiiVVNm3GEiIiIyHwwEElMVnU/M44QERERmQ0GIonJ70+ZcYSIiIjIfDAQSezPESITF0JEREQaDEQS06wh4pQZERGR2WAgkljVpfecMiMiIjIfDEQSkwlcVE1ERGRuGIgkxhEiIiIi88NAJDGZZmNG09ZBREREf2IgkhhHiIiIiMwPA5HEuIaIiIjI/DAQSYwjREREROaHgUhiMu5DREREZHYYiCQmv9/jao4QERERmQ0GIon9uVO1iQshIiIiDQYiiWnuZcYRIiIiIrPBQCQxzaJqriEiIiIyGwxEEuNl90REROaHgUhivOyeiIjI/DAQSazq1h2cMiMiIjIfDEQSk2sWVZu4ECIiItJgIJKYnBszEhERmR0GIonxsnsiIiLzw0AkMY4QERERmR8GIonJeOsOIiIis8NAJDHeuoOIiMj8MBBJTMZ9iIiIiMwOA5HE5NypmoiIyOwwEEmMI0RERETmh4FIYhwhIiIiMj8MRBKT3+9xjhARERGZDwYiiWk2ZuRVZkRERGaDgUhi3JiRiIjI/DAQSYyLqomIiMwPA5HEuKiaiIjI/DAQSYwjREREROaHgUhi8so8xEXVREREZoSBSGJyjhARERGZHQYiicm4hoiIiMjsMBBJjCNERERE5oeBSGIcISIiIjI/DEQS4607iIiIzA8DkcRkmp2qTVwIERERaTAQSaxqDZGaI0RERERmg4FIYjLey4yIiMjsMBBJTDNCxEBERERkNhiIJMbL7omIiMwPA5HEZLx1BxERkdlhIJIYR4iIiIjMDwORxLiomoiIyPwwEEmMl90TERGZHwYiiXGEiIiIyPwwEEms6tYdHCEiIiIyHwxEEpPz1h1ERERmh4FIYjKuISIiIjI7DEQSk3MNERERkdlhIJIYR4iIiIjMDwORxKpGiCo4QkRERGQ2GIgkJqu6yoyBiIiIyGwwEEmMt+4gIiIyPwxEEuNl90REROaHgUhimkXVnDIjIiIyGwxEEtOMEHHKjIiIyGwwEEmMi6qJiIjMDwORxDhCREREZH4YiCTGNURERETmh4FIYn+OEJm4ECIiItJgIJKYnCNEREREZoeBSGIyriEiIiIyOyYPRCtXroS/vz/s7e0RHByMffv2PbR9SkoKgoODYW9vj4CAAKxevVqnTUFBAcaNGwcvLy/Y29sjMDAQiYmJxvoIepHf73He7Z6IiMh8mDQQJSQkYPLkyZgxYwbS0tIQHh6Ofv36ISsrq9r2mZmZ6N+/P8LDw5GWlob3338fEydOxObNmzVtysvL0adPH1y+fBnffvstzp07hzVr1qBJkyZSfayHqhoh4t3uiYiIzIeNKd88NjYWo0ePxpgxYwAAcXFx2LlzJ1atWoVFixbptF+9ejV8fX0RFxcHAAgMDMSRI0ewdOlSvPrqqwCAtWvX4tatW0hNTYVCoQAA+Pn5SfOBakFzLzPeuoOIiMhs6B2IduzYAWdnZ3Tv3h0AsGLFCqxZswZt27bFihUr0KBBg1qdp7y8HEePHsW0adO0jkdERCA1Nb
Xa1xw8eBARERFax/r27Yv4+HgolUooFAr88MMPCA0Nxbhx4/D999/D3d0dw4YNw9SpUyGXy6s9b1lZGcrKyjSPCwsLAQBKpRJKpbJWn6e21CoVgMopM0Ofm6pX1c/sb2mwv6XHPpce+1x6tenzx/l56B2I3n33XXz44YcAgJMnT+Kdd95BTEwM9uzZg5iYGHz11Ve1Ok9eXh5UKhU8PDy0jnt4eCAnJ6fa1+Tk5FTbvqKiAnl5efDy8sKlS5ewZ88eDB8+HImJibhw4QLGjRuHiooKzJ49u9rzLlq0CPPmzdM5npSUBEdHx1p9ntrKvwcANqioqDCbdU3WIjk52dQlWBX2t/TY59Jjn0vvYX1eWlpa5/PqHYgyMzPRtm1bAMDmzZsxYMAALFy4EMeOHUP//v31LkC4v6amiiiKOsce1f7B42q1Go0bN8YXX3wBuVyO4OBg3LhxA0uWLKkxEE2fPh0xMTGax4WFhfDx8UFERARcXV31/kwPk5VXBKQdhCjI0L9/X4Oem6qnVCqRnJyMPn36aKZRyXjY39Jjn0uPfS692vR51QxPXegdiGxtbTUJbNeuXRgxYgQAoGHDhnoV4ubmBrlcrjMalJubqzMKVMXT07Pa9jY2NmjUqBEAwMvLCwqFQmt6LDAwEDk5OSgvL4etra3Oee3s7GBnZ6dzXKFQGPyLbmdbeT61KPL/RBIzxs+Tasb+lh77XHrsc+k9rM8f52eh91Vm3bt3R0xMDBYsWIBDhw7hxRdfBACcP38eTZs2rfV5bG1tERwcrDP0lZycjLCwsGpfExoaqtM+KSkJISEhmk7o1q0bLl68CLX6z1XL58+fh5eXV7VhSGqajRnFP0e3iIiIyLT0DkTLly+HjY0Nvv32W6xatUpzOfv27dvxwgsv6HWumJgYfPnll1i7di3S09MRHR2NrKwsREZGAqicyqoagQKAyMhIXLlyBTExMUhPT8fatWsRHx+PKVOmaNq8/fbbyM/Px6RJk3D+/Hn89NNPWLhwIcaNG6fvRzUK2QNTftyKiIiIyDzoPWXm6+uLbdu26Rz/5JNP9H7zIUOGID8/H/Pnz0d2djaCgoKQmJiouUw+Oztba08if39/JCYmIjo6GitWrIC3tzeWLVumueQeAHx8fJCUlITo6Gh06NABTZo0waRJkzB16lS96zOGqhEioPJKswcfExERkWnUaR+ijIwMfPXVV8jIyMCnn36Kxo0bY8eOHfDx8UG7du30OldUVBSioqKqfW7dunU6x3r27Iljx4499JyhoaH49ddf9apDKtojRBwiIiIiMgd6T5mlpKSgffv2+O2337BlyxYUFxcDAE6cOIE5c+YYvEBLI3+gx3n7DiIiIvOgdyCaNm0a/v3vfyM5OVlrkXKvXr1w8OBBgxZnieQPjBDxBq9ERETmQe9AdPLkSQwePFjnuLu7O/Lz8w1SlCWTPbBmSM0RIiIiIrOgdyCqX78+srOzdY6npaWZzQ1UzZnWCBEDERERkVnQOxBV3RcsJycHgiBArVbjwIEDmDJlitYl8lS9B0eIOGVGRERkHvQORB988AF8fX3RpEkTFBcXo23btujRowfCwsIwc+ZMY9RocWSoDEJq3vGeiIjILOh92b1CocCGDRuwYMECHDt2DGq1Gp07d0bLli2NUZ9FEgQAIkeIiIiIzEWd9iECgICAAAQEBBiyFqshEwCVyEXVRERE5kLvKbPXXnsNixcv1jm+ZMkS/O1vfzNIUZauqtO5qJqIiMg81Gljxqobuj7ohRdewC+//GKQoixd1bpqTpkRERGZB70DUXFxcbV3jVcoFCgsLDRIUZau6sp7TpkRERGZB70DUVBQEBISEnSOb9q0CW3btjVIUZZOM2XGESIiIiKzoPei6lmzZuHVV19FRkYGnnvuOQDA7t27sXHjRvzvf/8zeIGWSDNlxhEiIiIis6B3IBo4cCC+++47LFy4EN9++y0cHBzQoUMH7Nq1Cz179jRGjRbnzy
kz09ZBRERElep02f2LL75Y7cJqqh1OmREREZmXOu9DVF5ejtzcXKj/Mszh6+v72EVZOk6ZERERmRe9A9GFCxcwatQopKamah0XRRGCIEClUhmsOEulmTLjCBEREZFZ0DsQjRw5EjY2Nti2bRu8vLwgPHD3dqodbsxIRERkXvQORMePH8fRo0fRpk0bY9RjFWTch4iIiMis6L0PUdu2bZGXl2eMWqyGwJ2qiYiIzIregejDDz/Ee++9h7179yI/Px+FhYVaf+jROGVGRERkXvSeMuvduzcA4Pnnn9c6zkXVtSfjomoiIiKzoncg+vnnn41Rh1X587J709ZBRERElfQORNyN+vFVXZfHKTMiIiLzoPcaIgDYt28f/v73vyMsLAzXr18HAHzzzTfYv3+/QYuzVJwyIyIiMi96B6LNmzejb9++cHBwwLFjx1BWVgYAKCoqwsKFCw1eoCXiTtVERETmRe9A9O9//xurV6/GmjVroFAoNMfDwsJw7NgxgxZnqQRUBiGOEBEREZkHvQPRuXPn0KNHD53jrq6uKCgoMERNFo8jREREROZF70Dk5eWFixcv6hzfv38/AgICDFKUpWMgIiIiMi96B6J//etfmDRpEn777TcIgoAbN25gw4YNmDJlCqKiooxRo8WpusqMU2ZERETmQe/L7t977z3cuXMHvXr1wr1799CjRw/Y2dlhypQpGD9+vDFqtDjch4iIiMi86BWIVCoV9u/fj3feeQczZszAmTNnoFar0bZtWzg7OxurRosj473MiIiIzIpegUgul6Nv375IT09Hw4YNERISYqy6LFrVPCXvdk9ERGQe9F5D1L59e1y6dMkYtVgNgYuqiYiIzIregeiDDz7AlClTsG3bNmRnZ/Nu93XAnaqJiIjMi96Lql944QUAwMCBAyFUDXWAd7vXBy+7JyIiMi+8270JVA3LcVE1ERGReeDd7k1AM2XGESIiIiKzoHcg+uWXXx76fHW39SBtAvchIiIiMit6B6Jnn31W59iDa4m4hujROGVGRERkXvS+yuz27dtaf3Jzc7Fjxw489dRTSEpKMkaNFodTZkREROZF7xGievXq6Rzr06cP7OzsEB0djaNHjxqkMEsmcKdqIiIis6L3CFFN3N3dce7cOUOdzqJxp2oiIiLzovcI0YkTJ7Qei6KI7OxsLF68GB07djRYYZaM+xARERGZF70DUadOnSAIAsS/TPc888wzWLt2rcEKs2ScMiMiIjIvegeizMxMrccymQzu7u6wt7c3WFGWjlNmRERE5kXvQOTn52eMOqyKjCNEREREZkXvRdUTJ07EsmXLdI4vX74ckydPNkRNFk8mVAYhbsxIRERkHvQORJs3b0a3bt10joeFheHbb781SFGWrmobS06ZERERmQe9A1F+fn61exG5uroiLy/PIEVZOk6ZERERmRe9A1GLFi2wY8cOnePbt29HQECAQYqydNypmoiIyLzovag6JiYG48ePx82bN/Hcc88BAHbv3o2PP/4YcXFxhq7PIlVNmXGEiIiIyDzoHYhGjRqFsrIyfPDBB1iwYAEAoFmzZli1ahVGjBhh8AItETdmJCIiMi96ByIAePvtt/H222/j5s2bcHBwgLOzs6HrsmiaKTOOEBEREZmFOm3MWFFRgZYtW8Ld3V1z/MKFC1AoFGjWrJkh67NImikzjhARERGZBb0XVY8cORKpqak6x3/77TeMHDnSEDVZvD+nzExbBxEREVXSOxClpaVVuw/RM888g+PHjxuiJovHKTMiIiLzoncgEgQBRUVFOsfv3LkDlUplkKIsXVWnKzlEREREZBb0DkTh4eFYtGiRVvhRqVRYtGgRunfvbtDiLFUj+8qRoVPX73AdERERkRnQe1H1Rx99hB49eqB169YIDw8HAOzbtw+FhYXYs2ePwQu0RM1cgHoONrhdqsSxrNt4qllDU5dERERk1fQeIWrbti1OnDiB119/Hbm5uSgqKsKIESNw9uxZBAUFGa
NGiyMXgJ4tK6/Q23XmDxNXQ0RERHXah8jb2xsLFy40dC1W5bk27vjhRDZ2pf+B6f0DTV0OERGRVatTICooKEB8fDzS09MhCALatm2LUaNGVXvTV6pej5aNYCMTkHGzBJl5JfB3czJ1SURERFZL7ymzI0eOoHnz5vjkk09w69Yt5OXlITY2Fs2bN8exY8eMUaNFcrFX4OmAyrVDu9M5bUZERGRKegei6OhoDBw4EJcvX8aWLVuwdetWZGZmYsCAAZg8ebIRSrRc4ffXEaVlFZi2ECIiIiun95TZkSNHsGbNGtjY/PlSGxsbvPfeewgJCTFocZaujacLAODcH7r7OhEREZF09B4hcnV1RVZWls7xq1evwsXFxSBFWYvW9wNRZl4Jyiq4qSUREZGp6B2IhgwZgtGjRyMhIQFXr17FtWvXsGnTJowZMwZDhw41Ro0Wy9PVHq72NlCpRVy6WWLqcoiIiKyW3lNmS5cuhSAIGDFiBCoqKgAACoUCb7/9NhYvXmzwAi2ZIAho7emCw5dv41xOEQK9XE1dEhERkVXSOxDZ2tri008/xaJFi5CRkQFRFNGiRQs4Ojoaoz6L18rjfiDiOiIiIiKTqdM+RADg6OiI9u3bG7IWq1S1juh8DgMRERGRqei9hsjQVq5cCX9/f9jb2yM4OBj79u17aPuUlBQEBwfD3t4eAQEBWL16dY1tN23aBEEQ8PLLLxu4asNp7cErzYiIiEzNpIEoISEBkydPxowZM5CWlobw8HD069ev2qvYACAzMxP9+/dHeHg40tLS8P7772PixInYvHmzTtsrV65gypQpmhvQmqtW9wPRtdt3UVxWYeJqiIiIrFOdp8wMITY2FqNHj8aYMWMAAHFxcdi5cydWrVqFRYsW6bRfvXo1fH19ERcXBwAIDAzEkSNHsHTpUrz66quadiqVCsOHD8e8efOwb98+FBQUPLSOsrIylJWVaR4XFhYCAJRKJZRK5WN+Sm1V56v6X2dbAY1d7JBbVIYz12+js099g74f6fY5GRf7W3rsc+mxz6VXmz5/nJ9HrQJRly5dsHv3bjRo0ADz58/HlClTHnsRdXl5OY4ePYpp06ZpHY+IiEBqamq1rzl48CAiIiK0jvXt2xfx8fFQKpVQKBQAgPnz58Pd3R2jR49+5BQcACxatAjz5s3TOZ6UlGS0xeLJycmavzeQyZALGbbsOohsD9Eo70fafU7Gx/6WHvtceuxz6T2sz0tLS+t83loFovT0dJSUlKBBgwaYN28eIiMjHzso5OXlQaVSwcPDQ+u4h4cHcnJyqn1NTk5Ote0rKiqQl5cHLy8vHDhwAPHx8Th+/Hita5k+fTpiYmI0jwsLC+Hj44OIiAi4uhr2UnilUonk5GT06dNHE+D2l5/GuaPX4eXfGv2fDTDo+1H1fU7Gw/6WHvtceuxz6dWmz6tmeOqiVoGoU6dO+Mc//oHu3btDFEUsXboUzs7O1badPXu2XgUIgqD1WBRFnWOPal91vKioCH//+9+xZs0auLm51boGOzs72NnZ6RxXKBRG+6I/eO6GzpXvXVSm4v+xjMiYP0/Sxf6WHvtceuxz6T2szx/nZ1GrQLRu3TrMmTMH27ZtgyAI2L59u9a9zKoIglDrQOTm5ga5XK4zGpSbm6szClTF09Oz2vY2NjZo1KgRTp8+jcuXL+Oll17SPK9WqwFU3m/t3LlzaN68ea3qk1J9B1sAwO1SzkUTERGZQq0CUevWrbFp0yYAgEwmw+7du9G4cePHemNbW1sEBwcjOTkZgwcP1hxPTk7GoEGDqn1NaGgofvzxR61jSUlJCAkJgUKhQJs2bXDy5Emt52fOnImioiJ8+umn8PHxeayajaW+Y2WivXO33MSVEBERWSe9rzKrGnExhJiYGLz55psICQlBaGgovvjiC2RlZSEyMhJA5dqe69evY/369QCAyMhILF++HDExMRg7diwOHjyI+Ph4bNy4EQBgb2+PoKAgrfeoX78+AOgcNy
f1HSoDUQFHiIiIiEyiTpfdZ2RkIC4uDunp6RAEAYGBgZg0aZLe01FDhgxBfn4+5s+fj+zsbAQFBSExMRF+fn4AgOzsbK09ifz9/ZGYmIjo6GisWLEC3t7eWLZsmdYl90+ievdHiAruMhARERGZgt6BaOfOnRg4cCA6deqEbt26QRRFpKamol27dvjxxx/Rp08fvc4XFRWFqKioap9bt26dzrGePXvi2LFjtT5/decwNw0cK9cQcYSIiIjINPQORNOmTUN0dLTOne2nTZuGqVOn6h2I6M81RAWl5Y+8yo6IiIgMT+9bd6Snp2P06NE6x0eNGoUzZ84YpChrU3WVWYVaREm5ysTVEBERWR+9A5G7u3u1mx4eP378sa88s1b2ChlsbSp/FAWlvNKMiIhIanpPmY0dOxb//Oc/cenSJYSFhUEQBOzfvx8ffvgh3nnnHWPUaPEEQUADRwX+KCxDQakSTRuYuiIiIiLroncgmjVrFlxcXPDxxx9j+vTpAABvb2/MnTsXEydONHiB1qK+gy3+KCzDHV5pRkREJDm9A5EgCIiOjkZ0dDSKiooAAC4uLgYvzNpUXXp/m1NmREREkqvTPkRVGIQMh5szEhERmY7ei6rJOP68fQcDERERkdQYiMzEn5szcsqMiIhIagxEZkJz+w5OmREREUlOr0CkVCrRq1cvnD9/3lj1WK2qzRlvMxARERFJTq9ApFAocOrUKd5awgj+XEPEKTMiIiKp6T1lNmLECMTHxxujFqtWn1NmREREJqP3Zffl5eX48ssvkZycjJCQEDg5OWk9Hxsba7DirEnVlFkBrzIjIiKSnN6B6NSpU+jSpQsA6Kwl4lRa3WmmzEqVvOM9ERGRxPQORD///LMx6rB6VYGoXKVGabkKTnaPtWcmERER6aHOl91fvHgRO3fuxN27dwEAoigarChr5KCQw1Z+/473nDYjIiKSlN6BKD8/H88//zxatWqF/v37Izs7GwAwZswY3u3+MQiC8MDCal5pRkREJCW9A1F0dDQUCgWysrLg6OioOT5kyBDs2LHDoMVZG15pRkREZBp6L1RJSkrCzp070bRpU63jLVu2xJUrVwxWmDXyrOeA838U49rtUlOXQkREZFX0HiEqKSnRGhmqkpeXBzs7O4MUZa0C3Cq3MLh0s8TElRAREVkXvQNRjx49sH79es1jQRCgVquxZMkS9OrVy6DFWZsA98pAlMFAREREJCm9p8yWLFmCZ599FkeOHEF5eTnee+89nD59Grdu3cKBAweMUaPVCHBzBgBk5hWbuBIiIiLrovcIUdu2bXHixAl07doVffr0QUlJCV555RWkpaWhefPmxqjRavjfHyHKulWKCpXaxNUQERFZjzrt/ufp6Yl58+YZuhar5+VqD3uFDPeUaly9fRf+bk6PfhERERE9tjoFotu3byM+Ph7p6ekQBAGBgYH4xz/+gYYNGxq6Pqsikwnwd3NGenYhLt0sZiAiIiKSiN5TZikpKfD398eyZctw+/Zt3Lp1C8uWLYO/vz9SUlKMUaNVqbrSLDOPC6uJiIikovcI0bhx4/D6669j1apVkMvlAACVSoWoqCiMGzcOp06dMniR1oRXmhEREUlP7xGijIwMvPPOO5owBAByuRwxMTHIyMgwaHHWqCoQXbrJK82IiIikoncg6tKlC9LT03WOp6eno1OnToaoyar537/0/hKnzIiIiCRTqymzEydOaP4+ceJETJo0CRcvXsQzzzwDAPj111+xYsUKLF682DhVWpGqEaKbRWUouqeEi73CxBURERFZvloFok6dOkEQBIiiqDn23nvv6bQbNmwYhgwZYrjqrJCrvQINHBW4XarEtdt3EejFQERERGRstQpEmZmZxq6DHtDYxR63S5XIKy4zdSlERERWoVaByM/Pz9h10APcXexw7o8i3CxiICIiIpJCnTZmvH79Og4cOIDc3Fyo1dq3mJg4caJBCrNmbs62AMBAREREJBG9A9FXX32FyM
hI2NraolGjRhAEQfOcIAgMRAbg7mIHAJwyIyIikojegWj27NmYPXs2pk+fDplM76v2qRaqAhFHiIiIiKShd6IpLS3FG2+8wTBkRJpAxBEiIiIiSeidakaPHo3//e9/xqiF7nN3tgfAESIiIiKp6D1ltmjRIgwYMAA7duxA+/btoVBo75MTGxtrsOKsFafMiIiIpKV3IFq4cCF27tyJ1q1bA4DOomp6fFVXmd0uVUKpUkMh5/QkERGRMekdiGJjY7F27VqMHDnSCOUQADRwtIVcJkClFpFfXA7PevamLomIiMii6T30YGdnh27duhmjFrpPJhO4FxEREZGE9A5EkyZNwmeffWaMWugBf15pds/ElRAREVk+vafMDh06hD179mDbtm1o166dzqLqLVu2GKw4a+buzIXVREREUtE7ENWvXx+vvPKKMWqhB/y5W3W5iSshIiKyfHW6dQcZnxtHiIiIiCTD67nNFPciIiIiko7eI0T+/v4P3W/o0qVLj1UQVWIgIiIiko7egWjy5Mlaj5VKJdLS0rBjxw68++67hqrL6mkWVfN+ZkREREandyCaNGlStcdXrFiBI0eOPHZBVEmzqJojREREREZnsDVE/fr1w+bNmw11Oqvn4Vq5O3VRWQUK7ylNXA0REZFlM1gg+vbbb9GwYUNDnc7qOdnZoJFT5W7VWfmlJq6GiIjIsuk9Zda5c2etRdWiKCInJwc3b97EypUrDVqctfNt5Ij8knJcvVWKoCb1TF0OERGRxdI7EL388staj2UyGdzd3fHss8+iTZs2hqqLAPg2dERaVgGu3OIIERERkTHpHYjmzJljjDqoGn4NHQEAWQxERERERsWNGc2Yz/1AdJWBiIiIyKhqPUIkk8keuiEjAAiCgIqKiscuiir5NXICAFzhomoiIiKjqnUg2rp1a43Ppaam4rPPPoMoigYpiir53h8hul5wFxUqNWzkHNAjIiIyhloHokGDBukcO3v2LKZPn44ff/wRw4cPx4IFCwxanLVr7GIHWxsZyivUyL5zTzOFRkRERIZVpyGHGzduYOzYsejQoQMqKiqQlpaGr7/+Gr6+voauz6rJZIJmlIjTZkRERMajVyC6c+cOpk6dihYtWuD06dPYvXs3fvzxR7Rv395Y9Vk9X15pRkREZHS1njL76KOP8OGHH8LT0xMbN26sdgqNDE8zQnSrxMSVEBERWa5aB6Jp06bBwcEBLVq0wNdff42vv/662nZbtmwxWHH0ZyDipfdERETGU+tANGLEiEdedk+G59eoMhAdu1KAGwV34V3fwcQVERERWZ5aB6J169YZsQyqSUizhnB3sUNO4T0MXH4AX496Cu28eV8zIiIiQ+LGNmaunoMCW6PCEOjlirziMizdec7UJREREVkcBqInQNMGjlgwqB0A4PwfxSauhoiIyPIwED0h/N0qb+NxveAu7ilVJq6GiIjIsjAQPSEaOtminoMCAJCZx0vwiYiIDMnkgWjlypXw9/eHvb09goODsW/fvoe2T0lJQXBwMOzt7REQEIDVq1drPb9mzRqEh4ejQYMGaNCgAXr37o1Dhw4Z8yNIQhAEBLhXjhJduslAREREZEgmDUQJCQmYPHkyZsyYgbS0NISHh6Nfv37Iysqqtn1mZib69++P8PBwpKWl4f3338fEiROxefNmTZu9e/di6NCh+Pnnn3Hw4EH4+voiIiIC169fl+pjGU3VtFlmHtcRERERGZJJA1FsbCxGjx6NMWPGIDAwEHFxcfDx8cGqVauqbb969Wr4+voiLi4OgYGBGDNmDEaNGoWlS5dq2mzYsAFRUVHo1KkT2rRpgzVr1kCtVmP37t1SfSyjae7uDIAjRERERIZW632IDK28vBxHjx7FtGnTtI5HREQgNTW12tccPHgQERERWsf69u2L+Ph4KJVKKBQKndeUlpZCqVSiYcOGNdZSVlaGsrIyzePCwkIAgFKphFKprPVnqo2q89XlvL4N7AEAF28WGbwuS/
Y4fU76Y39Lj30uPfa59GrT54/z8zBZIMrLy4NKpYKHh4fWcQ8PD+Tk5FT7mpycnGrbV1RUIC8vD15eXjqvmTZtGpo0aYLevXvXWMuiRYswb948neNJSUlwdHSszcfRW3Jyst6vuVECADa4kH0HP/2UCG4crp+69DnVHftbeuxz6bHPpfewPi8trfttrkwWiKr89XYgoig+9BYh1bWv7jhQeUPajRs3Yu/evbC3t6/xnNOnT0dMTIzmcWFhIXx8fBAREQFXV9dafY7aUiqVSE5ORp8+faod0XqYe0oVPjq5G3dVAp7p+TwaOdsZtDZL9Th9Tvpjf0uPfS499rn0atPnVTM8dWGyQOTm5ga5XK4zGpSbm6szClTF09Oz2vY2NjZo1KiR1vGlS5di4cKF2LVrFzp06PDQWuzs7GBnpxsuFAqF0b7odTm3QqGAdz0HXC+4i6yCcng2cDZKbZbKmD9P0sX+lh77XHrsc+k9rM8f52dhskXVtra2CA4O1hn6Sk5ORlhYWLWvCQ0N1WmflJSEkJAQrU5YsmQJFixYgB07diAkJMTwxZvQn5fe80ozIiIiQzHpVWYxMTH48ssvsXbtWqSnpyM6OhpZWVmIjIwEUDmVNWLECE37yMhIXLlyBTExMUhPT8fatWsRHx+PKVOmaNp89NFHmDlzJtauXYtmzZohJycHOTk5KC62jABRdaVZBgMRERGRwZh0DdGQIUOQn5+P+fPnIzs7G0FBQUhMTISfnx8AIDs7W2tPIn9/fyQmJiI6OhorVqyAt7c3li1bhldffVXTZuXKlSgvL8drr72m9V5z5szB3LlzJflcxtTG0wUAcPpG3edJiYiISJvJF1VHRUUhKiqq2ufWrVunc6xnz544duxYjee7fPmygSozT+2b1gMAnLx+55EL0ImIiKh2TH7rDtJPKw8X2NrIUHSvAlfy6355IREREf2JgegJo5DLEOhVuRXAiet3TFwNERGRZWAgegJ1aFI5bXaKgYiIiMggGIieQO3vB6IT1wpMWwgREZGFYCB6AlUtrD51vRBqtWjiaoiIiJ58DERPoJaNnWFnI0NxWQUu55eYuhwiIqInHgPRE8hGLkNb78qF1ZsOX9Xcz42IiIjqxuT7EFHdvNK5CdKyCvDFL5dw/o8iBLg5o42XC14P8TF1aURERE8cBqIn1JuhzSCTCZjz/WnsPXcTe8/dBFC54LrqsnwiIiKqHQaiJ9jwp/0Q5F0PSWdysDs9F2dzirDrzB8MRERERHriGqInXEef+ni3bxuMDGsGANh1Nte0BRERET2BGIgsxHNtGgMAfr9agNzCeyauhoiI6MnCQGQhGrvao6NPfQDAHo4SERER6YWByIL0vj9KtCv9DxNXQkRE9GRhILIgvdt6AAB+uZCHq7dKTVwNERHRk4OByIK08XRBWPNGKK9QY96Pp01dDhER0RODgciCCIKA+YPaQSEXsCs9F7vOcOqMiIioNhiILEyLxi4Y3T0AABD9f8fxy/mbJq6IiIjI/DEQWaCJz7dAiF8DFN2rwMivDmHt/kze74yIiOghGIgskKOtDTaMfRqvBTeFWgTmbzuDaZtPorxCberSiIiIzBIDkYWys5FjyWsdMKN/IGQCkHDkKlanZJi6LCIiIrPEQGTBBEHA2B4BmDewHQDgpxPZJq6IiIjIPDEQWYGXOnpDLhNw7o8i7k9ERERUDQYiK1Df0RYhfg0AcBdrIiKi6jAQWYnegZW7WO9O533OiIiI/oqByEpU3dbj10v5OHOjEHdKlSauiIiIyHwwEFkJfzcnBLg7oUItov+yfXjqg13Y8NsVU5dFRERkFmxMXQBJJ7JncyzefhblFWoUl1VgxtZT2JOei0bOtghr7oaXOzcxdYlEREQmwUBkRV4P8cHrIT4QRREr92Zgyc5z2H22ck3R/45eg7+bEzr61DdtkURERCbAKTMrJAgCxvVqgW8jQ/Fu39bo1qIRRBGY9f0pqNS8xQcREVkfBi
IrFtKsIcb1aoFPhnSCi50NTly7w3VFRERklRiICI1d7BET0QoAMPeH0/hy3yXeDJaIiKwKAxEBAEaENsOQEB+oReDfP6Vj6uYTKKtQmbosIiIiSTAQEQBALhOw+NX2mDWgLWQC8H9HruHvX/6GvOIyU5dGRERkdAxEpCEIAkZ398dX/+gKF3sbHL58G4OWH8CZG4WmLo2IiMioGIhIR89W7tga1Q3NGjniesFdvLY6FTtO5Zi6LCIiIqNhIKJqtWjsjO/GdUP3Fm4oLVch8j9HsXzPBS62JiIii8RARDWq72iLdf94CiPDmgEAliadx8RNx3FPycXWRERkWRiI6KFs5DLMHdgOCwe3h41MwI+/38Drnx9Ezp17pi6NiIjIYBiIqFaGPe2L/4x5Gg0cFThx7Q4GLt+P/ztyFT+dyMbNIl6JRkRETzbey4xq7ZmARvhhfHeM/vowzv9RjPe+PQEAqO+owKrhwQht3sjEFRIREdUNAxHpxaehI7ZEdcOSHWdx7o8iZN+5hyv5pXgz/je08nCBk50cUc+2QK82jU1dKhERUa0xEJHenO1sMG9QEADgnlKFd789gR9/v4Ez2ZX7FY36+jCie7dCZ9/68KrngBaNnU1ZLhER0SMxENFjsVfIseyNThjT3R8Fd5XYcSobGw9dRWzyeU2bic+1wOTerSCTCSaslIiIqGYMRPTYBEFAR5/6AIAeLd0Q1KQeEg5fxT2lCuf/KMayPRexKz0XDZwUeMa/Ed5+tjls5FzPT0RE5oOBiAxKEAQMf9oPw5/2AwB8e/Qa3t9yUjOdduBiPg5dvoWYPq0gvz9i5Ghrg+buThAEjiAREZFpMBCRUb0W3BQhfg3w+7UC5BWXY+nOc9h3IQ/7LuRptXuxvReW/K0DHG35lSQiIunxXx8yumZuTmjm5gQAeCagIWZ+dwp/PLCxY25RGX46mY2zOYVo7emiOd7G0xWRPZvD1obTa0REZFwMRCSpdt71sDWqm9axw5dvIfKbo8i4WYKMmyWa44knc7D/Yh7+/XIQ7G3kWq9xsJXD3cVOkpqJiMjyMRCRyT3VrCG2TwrH7rO5qFCpAQAl5Sqs2HMRhzJvIeKTX6p93fCnfTF3YDsouECbiIgeEwMRmYXGrvYY2tVX61jvwMaI+b/fkZFbrNO+pFyFDb9l4fSNQgR6uWqOt/V2xfCuvrzEn4iI9MJARGarRWMX/DC+e7XP7U7/AxM3puH41QIcv1qg9VzKuVwsHNwe9rZyuNorJKiUiIiedAxE9ER6PtAD2yaGY/upbKhUIgCguLwCXx24jF3pudiVvhsA0LOVO5a+GmTKUomI6AnAQERPLH83J0Q920Lr2AvtPDFp03Fk3SoFAKScv4nXPv8Nw3xMUSERET0pGIjIonT2bYCUd59FhVrEhT+KMXb9EVy5VYpP7sjRvMNN9G7nbeoSiYjIDDEQkcURBAEKuYC23q74YXw3/OubIzhypQBjvkkDkKbTvp23K5YN7Yzm7rwJLRGRtWIgIovWyNkOX48MwdhVSTh4UwZR1G1z+kYhXl5xAGPDAx65CaRvQ0f0C/LkbUaIiCwMAxFZPFsbGYY0V2P52OehFrQ3eCy+V4Ep//sdR67cRmzy+Vqd78UOXlj6Wkc42Mof3ZiIiJ4IDERkNVzsFVAotC/Dd3O2w4axT+PLfZnIzCup4ZWVKlRq/HQyGz+dyMbPZ3NrHE1ytrPBOxGtMLhzU4PVTkRExsVARFbPzkaOcb1aPLohgGFP++Ht/xxFfkk5SstV1bYpKFUiOuF3HLtSgLbertW2kQsCerZ2h4erfZ3rJiIiw2EgItJDV/+GODDtOVy7fbfGNlvTrmHFzxn45tcrDz1XfUcFVg0PRmjzRoYuk4iI9MRARKQne4UcLRrXfEXau33boH2T+vgu7Toq1NWs4gaQmVeMjJsleDP+NzRp4FCnOlzsbfBORGv0at24Tq8nIqI/MRARGcELQZ54IcizxufvKVV479
sT+OH3G7iSX1rn9xm97jAmPNcSbTxd6vT6Zm5OWveCIyKyVgxERCZgr5Dj0zc6YcJzLVB4r6JO5/jfkavYdPgqPt194bFqienTChOea8GtBIjIqjEQEZmIIAho6VG3kR0A6OJbHx2a1scPv1+HWq3/68sqVPj92h3EJp/HD7/fgJOBthEQRRHlxTI06XAHIf5uBjknEZGxMRARPaEEQcCwp30x7GnfOp8j4XAWZn53Chdziw1YGQDIMCz+MGa9GGiYHcAFoGPT+nCy468sIjIO/nYhsmJDnvJFWHM3XMgtMtg5KypUWPbTUZy6Dcz6/rTBzutVzx5rRoQgqEk9g52TiKgKAxGRlfNp6Aifho4GO59SqUTJRTUuO7bEzjO5UFd3vxQ95ReXI/vOPby2OhVPNWtogCr108DRFtF9WsHfzUny9yYiaTAQEZHByQRgfK/miI5oY5DzFd5TYsJ/05By/ib2XcgzyDn1lXL+Jha/0h6+jQwXHuvCzkaG5u7OXARPZGAMRERk9lztFVg78in8cuEmCkrLJX1vUQS++fUK0rIK8PaGY5K+d026tWiEFcO6oL6jralLIbIYDERE9ESQywSTbULZv70XPvgpHcln/oCIx58CfBy3S5Q4cDEfLy3fj04+DQAAarUa2TdkSCo+AZms+nvsPak6+9THyLBmkMk4IkbGZfJAtHLlSixZsgTZ2dlo164d4uLiEB4eXmP7lJQUxMTE4PTp0/D29sZ7772HyMhIrTabN2/GrFmzkJGRgebNm+ODDz7A4MGDjf1RiMhC2SvkWPByEBa8HGTqUpCeXYgxXx/B1Vt3cfXWg7eQkQH5OSary1h+/P0GUjPyMPPFtrCRm08oqqiowK0y4HrBXdjYKE1dzhPJ1kaGxi7mcz9HkwaihIQETJ48GStXrkS3bt3w+eefo1+/fjhz5gx8fXUvJc7MzET//v0xduxY/Oc//8GBAwcQFRUFd3d3vPrqqwCAgwcPYsiQIViwYAEGDx6MrVu34vXXX8f+/fvx9NNPS/0RiYgMKtDLFdsmdMeO0zm4p6y8wbBKpcKZM2fQtm1byOWG2U/KHBSUKrEqJQO70nOxKz3X1OVUwwbzju0zdRFPrC6+9bElqpupy9AwaSCKjY3F6NGjMWbMGABAXFwcdu7ciVWrVmHRokU67VevXg1fX1/ExcUBAAIDA3HkyBEsXbpUE4ji4uLQp08fTJ8+HQAwffp0pKSkIC4uDhs3bqy2jrKyMpSVlWkeFxYWAqi8WkapNGzyrzqfoc9LNWOfS4v9bXzOtgJe6+yleaxUKpF8+zT6hHhDoVCYsDLDC2/REFO3nHroDZVNRa1SQWZBAVRqNjJBr98Ttfnd8ji/d0wWiMrLy3H06FFMmzZN63hERARSU1Orfc3BgwcRERGhdaxv376Ij4+HUqmEQqHAwYMHER0drdOmKkRVZ9GiRZg3b57O8aSkJDg6GueKkuTkZKOcl2rGPpcW+1t6ltrnE1uYuoKHUZm6gCfYTSQmJur9qod9z0tL635vSJMFory8PKhUKnh4eGgd9/DwQE5O9fPgOTk51bavqKhAXl4evLy8amxT0zmBylGkmJgYzePCwkL4+PggIiICrq6GvfGlUqlEcnIy+vTpY3H/JWeu2OfSYn9Lj30uPfa59GrT51UzPHVh8kXVf91LQxTFh+6vUV37vx7X95x2dnaws7PTOa5QKIz2RTfmual67HNpsb+lxz6XHvtceg/r88f5WZjs+kw3NzfI5XKdkZvc3FydEZ4qnp6e1ba3sbFBo0aNHtqmpnMSERERmSwQ2draIjg4WGcuMDk5GWFhYdW+JjQ0VKd9UlISQkJCNKmwpjY1nZOIiIjIpFNmMTExePPNNxESEoLQ0FB88cUXyMrK0uwrNH36dFy/fh3r168HAERGRmL58uWIiYnB2LFjcfDgQcTHx2tdPTZp0iT06NEDH374IQYNGoTvv/8eu3btwv79+03yGYmIiMj8mT
QQDRkyBPn5+Zg/fz6ys7MRFBSExMRE+Pn5AQCys7ORlZWlae/v74/ExERER0djxYoV8Pb2xrJlyzSX3ANAWFgYNm3ahJkzZ2LWrFlo3rw5EhISuAcRERER1cjki6qjoqIQFRVV7XPr1q3TOdazZ08cO/bw+wm99tpreO211wxRHhEREVkBy7rpDREREVEdMBARERGR1WMgIiIiIqvHQERERERWj4GIiIiIrB4DEREREVk9BiIiIiKyeibfh8gcVd0w9nHumlsTpVKJ0tJSFBYW8oaAEmGfS4v9LT32ufTY59KrTZ9X/btd9e+4PhiIqlFUVAQA8PHxMXElREREpK+ioiLUq1dPr9cIYl1ilIVTq9W4ceMGXFxcIAiCQc9dWFgIHx8fXL16Fa6urgY9N1WPfS4t9rf02OfSY59LrzZ9LooiioqK4O3tDZlMv1VBHCGqhkwmQ9OmTY36Hq6urvw/kcTY59Jif0uPfS499rn0HtXn+o4MVeGiaiIiIrJ6DERERERk9RiIJGZnZ4c5c+bAzs7O1KVYDfa5tNjf0mOfS499Lj1j9zkXVRMREZHV4wgRERERWT0GIiIiIrJ6DERERERk9RiIiIiIyOoxEElo5cqV8Pf3h729PYKDg7Fv3z5Tl2Qx5s6dC0EQtP54enpqnhdFEXPnzoW3tzccHBzw7LPP4vTp0yas+Mnzyy+/4KWXXoK3tzcEQcB3332n9Xxt+risrAwTJkyAm5sbnJycMHDgQFy7dk3CT/HkeFR/jxw5Uuc7/8wzz2i1YX/rZ9GiRXjqqafg4uKCxo0b4+WXX8a5c+e02vB7bji16W8pv+cMRBJJSEjA5MmTMWPGDKSlpSE8PBz9+vVDVlaWqUuzGO3atUN2drbmz8mTJzXPffTRR4iNjcXy5ctx+PBheHp6ok+fPpr71tGjlZSUoGPHjli+fHm1z9emjydPnoytW7di06ZN2L9/P4qLizFgwACoVCqpPsYT41H9DQAvvPCC1nc+MTFR63n2t35SUlIwbtw4/Prrr0hOTkZFRQUiIiJQUlKiacPvueHUpr8BCb/nIkmia9euYmRkpNaxNm3aiNOmTTNRRZZlzpw5YseOHat9Tq1Wi56enuLixYs1x+7duyfWq1dPXL16tUQVWhYA4tatWzWPa9PHBQUFokKhEDdt2qRpc/36dVEmk4k7duyQrPYn0V/7WxRF8a233hIHDRpU42vY348vNzdXBCCmpKSIosjvubH9tb9FUdrvOUeIJFBeXo6jR48iIiJC63hERARSU1NNVJXluXDhAry9veHv74833ngDly5dAgBkZmYiJydHq//t7OzQs2dP9r+B1KaPjx49CqVSqdXG29sbQUFB/DnU0d69e9G4cWO0atUKY8eORW5uruY59vfju3PnDgCgYcOGAPg9N7a/9ncVqb7nDEQSyMvLg0qlgoeHh9ZxDw8P5OTkmKgqy/L0009j/fr12LlzJ9asWYOcnByEhYUhPz9f08fsf+OpTR/n5OTA1tYWDRo0qLEN1V6/fv2wYcMG7NmzBx9//DEOHz6M5557DmVlZQDY349LFEXExMSge/fuCAoKAsDvuTFV19+AtN9z3u1eQoIgaD0WRVHnGNVNv379NH9v3749QkND0bx5c3z99deaBXjsf+OrSx/z51A3Q4YM0fw9KCgIISEh8PPzw08//YRXXnmlxtexv2tn/PjxOHHiBPbv36/zHL/nhldTf0v5PecIkQTc3Nwgl8t10mpubq7Of2mQYTg5OaF9+/a4cOGC5moz9r/x1KaPPT09UV5ejtu3b9fYhurOy8sLfn5+uHDhAgD29+OYMGECfvjhB/z8889o2rSp5ji/58ZRU39Xx5jfcwYiCdja2iI4OBjJyclax5OTkxEWFmaiqixbWVkZ0tPT4eXlBX9/f3h6emr1f3l5OVJSUtj/BlKbPg4ODoZCodBqk52djVOnTvHnYAD5+fm4evUqvLy8ALC/60IURYwfPx5btm
zBnj174O/vr/U8v+eG9aj+ro5Rv+d6LcGmOtu0aZOoUCjE+Ph48cyZM+LkyZNFJycn8fLly6YuzSK888474t69e8VLly6Jv/76qzhgwADRxcVF07+LFy8W69WrJ27ZskU8efKkOHToUNHLy0ssLCw0ceVPjqKiIjEtLU1MS0sTAYixsbFiWlqaeOXKFVEUa9fHkZGRYtOmTcVdu3aJx44dE5977jmxY8eOYkVFhak+ltl6WH8XFRWJ77zzjpiamipmZmaKP//8sxgaGio2adKE/f0Y3n77bbFevXri3r17xezsbM2f0tJSTRt+zw3nUf0t9fecgUhCK1asEP38/ERbW1uxS5cuWpcW0uMZMmSI6OXlJSoUCtHb21t85ZVXxNOnT2ueV6vV4pw5c0RPT0/Rzs5O7NGjh3jy5EkTVvzk+fnnn0UAOn/eeustURRr18d3794Vx48fLzZs2FB0cHAQBwwYIGZlZZng05i/h/V3aWmpGBERIbq7u4sKhUL09fUV33rrLZ2+ZH/rp7r+BiB+9dVXmjb8nhvOo/pb6u+5cL8oIiIiIqvFNURERERk9RiIiIiIyOoxEBEREZHVYyAiIiIiq8dARERERFaPgYiIiIisHgMRERERWT0GIiIiIrJ6DEREJAlBEPDdd9+ZtAZRFPHPf/4TDRs2hCAIOH78uEnrISLzYWPqAojIvI0cORIFBQUmDzOGsGPHDqxbtw579+5FQEAA3NzcTF0SEZkJBiIishoZGRnw8vLiXceJSAenzIhIL88++ywmTpyI9957Dw0bNoSnpyfmzp2r1ebChQvo0aMH7O3t0bZtWyQnJ+uc5/r16xgyZAgaNGiARo0aYdCgQbh8+TIA4OzZs3B0dMR///tfTfstW7bA3t4eJ0+erLG2lJQUdO3aFXZ2dvDy8sK0adNQUVEBoHKka8KECcjKyoIgCGjWrFm158jPz8fQoUPRtGlTODo6on379ti4ceMj+2XNmjXw8fGBo6MjBg8ejNjYWNSvX1+rzY8//ojg4GDY29sjICAA8+bN09QHVE4rfvnllxg8eDAcHR3RsmVL/PDDD1rnOHPmDPr37w9nZ2d4eHjgzTffRF5e3iPrI6JHeOzb1RKRRXvrrbfEQYMGaR737NlTdHV1FefOnSueP39e/Prrr0VBEMSkpCRRFEVRpVKJQUFB4rPPPiumpaWJKSkpYufOnUUA4tatW0VRFMWSkhKxZcuW4qhRo8QTJ06IZ86cEYcNGya2bt1aLCsrE0VRFFesWCHWq1dPvHz5snj9+nWxYcOG4ieffFJjndeuXRMdHR3FqKgoMT09Xdy6davo5uYmzpkzRxRFUSwoKBDnz58vNm3aVMzOzhZzc3NrPM+SJUvEtLQ0MSMjQ1y2bJkol8vFX3/9tcb33r9/vyiTycQlS5aI586dE1esWCE2bNhQrFevnqbNjh07RFdXV3HdunViRkaGmJSUJDZr1kycO3eupg0AsWnTpuJ///tf8cKFC+LEiRNFZ2dnMT8/XxRFUbxx44bo5uYmTp8+XUxPTxePHTsm9unTR+zVq1eNtRFR7TAQEdFDVReIunfvrtXmqaeeEqdOnSqKoiju3LlTlMvl4tWrVzXPb9++XSsQxcfHi61btxbVarWmTVlZmejg4CDu3LlTc+zFF18Uw8PDxeeff17s06ePVvu/ev/993XOuWLFCtHZ2VlUqVSiKIriJ598Ivr5+endB/379xffeeedGp8fMmSI+OKLL2odGz58uFYgCg8PFxcuXKjV5ptvvhG9vLw0jwGIM2fO1DwuLi4WBUEQt2/fLoqiKM6aNUuMiIjQOsfVq1dFAOK5c+f0/lxE9CeuISIivXXo0EHrsZeXF3JzcwEA6enp8PX1RdOmTTXPh4aGarU/evQoLl68CBcXF63j9+7dQ0ZGhubx2rVr0apVK8hkMpw6dQqCINRYU3p6OkJDQ7XadOvWDcXFxbh27Rp8fX1r9dlUKhUWL16MhIQEXL
9+HWVlZSgrK4OTk1ONrzl37hwGDx6sdaxr167Ytm2b1mc+fPgwPvjgA633unfvHkpLS+Ho6AhAu2+dnJzg4uKi6dujR4/i559/hrOzs04NGRkZaNWqVa0+IxHpYiAiIr0pFAqtx4IgQK1WA6i8tP2v/hpk1Go1goODsWHDBp227u7umr///vvvKCkpgUwmQ05ODry9vWusSRRFnfepquVhQeqvPv74Y3zyySeIi4tD+/bt4eTkhMmTJ6O8vLxO711FrVZj3rx5eOWVV3Reb29vr/n7w/pWrVbjpZdewocffqhzDi8vr0d/OCKqEQMRERlU27ZtkZWVhRs3bmgCzMGDB7XadOnSBQkJCWjcuDFcXV2rPc+tW7cwcuRIzJgxAzk5ORg+fDiOHTsGBweHGt938+bNWuEkNTUVLi4uaNKkSa3r37dvHwYNGoS///3vACpDyIULFxAYGFjja9q0aYNDhw5pHTty5IjOZz537hxatGhR61r+qkuXLti8eTOaNWsGGxv++iYyJF5lRkQG1bt3b7Ru3RojRozA77//jn379mHGjBlabYYPHw43NzcMGjQI+/btQ2ZmJlJSUjBp0iRcu3YNABAZGQkfHx/MnDkTsbGxEEURU6ZMqfF9o6KicPXqVUyYMAFnz57F999/jzlz5iAmJgYyWe1/1bVo0QLJyclITU1Feno6/vWvfyEnJ+ehr5kwYQISExMRGxuLCxcu4PPPP8f27du1Ro1mz56N9evXY+7cuTh9+jTS09ORkJCAmTNn1rq2cePG4datWxg6dCgOHTqES5cuISkpCaNGjYJKpar1eYhIFwMRERmUTCbD1q1bUVZWhq5du2LMmDFa62YAwNHREb/88gt8fX3xyiuvIDAwEKNGjcLdu3fh6uqK9evXIzExEd988w1sbGzg6OiIDRs24Msvv0RiYmK179ukSRMkJibi0KFD6NixIyIjIzF69Gi9AgcAzJo1C126dEHfvn3x7LPPwtPTEy+//PJDX9OtWzesXr0asbGx6NixI3bs2IHo6GitqbC+ffti27ZtSE5OxlNPPYVnnnkGsbGx8PPzq3Vt3t7eOHDgAFQqFfr27YugoCBMmjQJ9erV0yv0EZEuQaxuwp+IiB7L2LFjcfbsWezbt8/UpRBRLXASmojIAJYuXYo+ffrAyckJ27dvx9dff42VK1eauiwiqiWOEBERGcDrr7+OvXv3oqioCAEBAZgwYQIiIyNNXRYR1RIDEREREVk9rsIjIiIiq8dARERERFaPgYiIiIisHgMRERERWT0GIiIiIrJ6DERERERk9RiIiIiIyOoxEBEREZHV+39k3yjJjnt+LQAAAABJRU5ErkJggg==", |
|
|
993 |
"text/plain": [ |
|
|
994 |
"<Figure size 640x480 with 1 Axes>" |
|
|
995 |
] |
|
|
996 |
}, |
|
|
997 |
"metadata": {}, |
|
|
998 |
"output_type": "display_data" |
|
|
999 |
} |
|
|
1000 |
], |
|
|
1001 |
"source": [ |
|
|
1002 |
"s= sum(unique_genes.values)\n", |
|
|
1003 |
"h = unique_genes.values/s\n", |
|
|
1004 |
"plt.plot(h, label = \"Histogram of Genes\")\n", |
|
|
1005 |
"plt.ylabel(\"Number of occurences\")\n", |
|
|
1006 |
"plt.xlabel(\"Index of a gene\")\n", |
|
|
1007 |
"plt.legend()\n", |
|
|
1008 |
"plt.grid()\n", |
|
|
1009 |
"plt.show()" |
|
|
1010 |
] |
|
|
1011 |
}, |
|
|
1012 |
{ |
|
|
1013 |
"cell_type": "code", |
|
|
1014 |
"execution_count": 22, |
|
|
1015 |
"metadata": {}, |
|
|
1016 |
"outputs": [ |
|
|
1017 |
{ |
|
|
1018 |
"data": { |
|
|
1019 |
"image/png": "", |
|
|
1020 |
"text/plain": [ |
|
|
1021 |
"<Figure size 640x480 with 1 Axes>" |
|
|
1022 |
] |
|
|
1023 |
}, |
|
|
1024 |
"metadata": {}, |
|
|
1025 |
"output_type": "display_data" |
|
|
1026 |
}, |
|
|
1027 |
{ |
|
|
1028 |
"name": "stdout", |
|
|
1029 |
"output_type": "stream", |
|
|
1030 |
"text": [ |
|
|
1031 |
"Top 50 most frequent genes nearly contribute to 75% of data\n" |
|
|
1032 |
] |
|
|
1033 |
} |
|
|
1034 |
], |
|
|
1035 |
"source": [ |
|
|
1036 |
"cdf = np.cumsum(h)\n", |
|
|
1037 |
"plt.plot(cdf, color =\"green\", label = \"Cumulative Distribution of Genes\")\n", |
|
|
1038 |
"plt.legend()\n", |
|
|
1039 |
"plt.grid()\n", |
|
|
1040 |
"plt.show()\n", |
|
|
1041 |
"print(\"Top 50 most frequent genes nearly contribute to 75% of data\")\n" |
|
|
1042 |
] |
|
|
1043 |
}, |
|
|
1044 |
{ |
|
|
1045 |
"attachments": {}, |
|
|
1046 |
"cell_type": "markdown", |
|
|
1047 |
"metadata": {}, |
|
|
1048 |
"source": [ |
|
|
1049 |
"Converting the genes into vectore by Response Coding and One-Hot Encoding\n" |
|
|
1050 |
] |
|
|
1051 |
}, |
|
|
1052 |
{ |
|
|
1053 |
"cell_type": "code", |
|
|
1054 |
"execution_count": 23, |
|
|
1055 |
"metadata": {}, |
|
|
1056 |
"outputs": [ |
|
|
1057 |
{ |
|
|
1058 |
"name": "stdout", |
|
|
1059 |
"output_type": "stream", |
|
|
1060 |
"text": [ |
|
|
1061 |
"Here's the first 10 items in the Gene array after response coding: \n", |
|
|
1062 |
"\n", |
|
|
1063 |
"('BRCA1', [0.10560344827586207, 0.0021551724137931034, 0.034482758620689655, 0.07543103448275862, 0.17672413793103448, 0.09267241379310345, 0.0021551724137931034, 0.0021551724137931034, 0.0021551724137931034])\n", |
|
|
1064 |
"('TP53', [0.1772853185595568, 0.00554016620498615, 0.00554016620498615, 0.1329639889196676, 0.008310249307479225, 0.0110803324099723, 0.002770083102493075, 0.002770083102493075, 0.002770083102493075])\n", |
|
|
1065 |
"('EGFR', [0.0056022408963585435, 0.10084033613445378, 0.0028011204481792717, 0.014005602240896359, 0.011204481792717087, 0.0028011204481792717, 0.19607843137254902, 0.0056022408963585435, 0.0028011204481792717])\n", |
|
|
1066 |
"('PTEN', [0.011428571428571429, 0.002857142857142857, 0.017142857142857144, 0.27714285714285714, 0.005714285714285714, 0.002857142857142857, 0.005714285714285714, 0.002857142857142857, 0.002857142857142857])\n", |
|
|
1067 |
"('BRCA2', [0.02702702702702703, 0.003003003003003003, 0.003003003003003003, 0.009009009009009009, 0.03003003003003003, 0.2132132132132132, 0.003003003003003003, 0.003003003003003003, 0.003003003003003003])\n", |
|
|
1068 |
"('KIT', [0.003076923076923077, 0.11076923076923077, 0.006153846153846154, 0.003076923076923077, 0.003076923076923077, 0.003076923076923077, 0.14153846153846153, 0.003076923076923077, 0.003076923076923077])\n", |
|
|
1069 |
"('BRAF', [0.003205128205128205, 0.07371794871794872, 0.00641025641025641, 0.003205128205128205, 0.019230769230769232, 0.01282051282051282, 0.12179487179487179, 0.003205128205128205, 0.003205128205128205])\n", |
|
|
1070 |
"('ERBB2', [0.006600660066006601, 0.039603960396039604, 0.0033003300330033004, 0.0165016501650165, 0.019801980198019802, 0.039603960396039604, 0.0891089108910891, 0.006600660066006601, 0.0033003300330033004])\n", |
|
|
1071 |
"('ALK', [0.0033112582781456954, 0.026490066225165563, 0.026490066225165563, 0.0033112582781456954, 0.019867549668874173, 0.0033112582781456954, 0.13245033112582782, 0.0033112582781456954, 0.0033112582781456954])\n", |
|
|
1072 |
"('PDGFRA', [0.006802721088435374, 0.013605442176870748, 0.030612244897959183, 0.006802721088435374, 0.017006802721088437, 0.003401360544217687, 0.11564625850340136, 0.003401360544217687, 0.003401360544217687])\n" |
|
|
1073 |
] |
|
|
1074 |
} |
|
|
1075 |
], |
|
|
1076 |
"source": [ |
|
|
1077 |
"#In Response Coding, each level of a categorical variable is assigned a probability value based \n", |
|
|
1078 |
"#on the propoertion of positive class observations for that level.\n", |
|
|
1079 |
"\n", |
|
|
1080 |
"#response coding for a gene using laplace smoothing \n", |
|
|
1081 |
"alpha = 1\n", |
|
|
1082 |
"\n", |
|
|
1083 |
"#Creating a mehtod for gene one-hot encoding that can be used for all 3 data sets \n", |
|
|
1084 |
"def gene_response_coding(df):\n", |
|
|
1085 |
" gene_dict = dict()\n", |
|
|
1086 |
" values = df['Gene'].value_counts()\n", |
|
|
1087 |
" for gene, freq in values.items():\n", |
|
|
1088 |
" prob_vec = []\n", |
|
|
1089 |
" for cancer_class in range(1,10):\n", |
|
|
1090 |
" class_occurence = df.loc[(df['Class'] == cancer_class) & (df['Gene'] == gene)]\n", |
|
|
1091 |
" prob_vec.append((class_occurence.shape[0] + alpha)/(freq + (alpha * values.shape[0])))\n", |
|
|
1092 |
" gene_dict[gene] = prob_vec\n", |
|
|
1093 |
" \n", |
|
|
1094 |
" return gene_dict\n", |
|
|
1095 |
"\n", |
|
|
1096 |
"\n", |
|
|
1097 |
"train_gene_responseCoding = gene_response_coding(X_train)\n", |
|
|
1098 |
"cv_gene_responseCoding = gene_response_coding(X_vaildation)\n", |
|
|
1099 |
"test_gene_responseCoding = gene_response_coding(X_test)\n", |
|
|
1100 |
"print(\"Here's the first 10 items in the Gene array after response coding: \\n\")\n", |
|
|
1101 |
"from itertools import islice\n", |
|
|
1102 |
"n_items = list(islice(train_gene_responseCoding.items(), 10))\n", |
|
|
1103 |
"for i in range(0,10):\n", |
|
|
1104 |
" print(n_items[i]) \n", |
|
|
1105 |
"\n", |
|
|
1106 |
"\n", |
|
|
1107 |
"\n", |
|
|
1108 |
"\n", |
|
|
1109 |
"\n" |
|
|
1110 |
] |
|
|
1111 |
}, |
|
|
1112 |
{ |
|
|
1113 |
"cell_type": "code", |
|
|
1114 |
"execution_count": 24, |
|
|
1115 |
"metadata": {}, |
|
|
1116 |
"outputs": [], |
|
|
1117 |
"source": [ |
|
|
1118 |
"# So if you look at the first entry BRCA 1, its mutatations have the highest probability of being in class 5 \n", |
|
|
1119 |
"#Then, the probability of the mutation falling under class 1, 4, 6 is nearly the same " |
|
|
1120 |
] |
|
|
1121 |
}, |
|
|
1122 |
{ |
|
|
1123 |
"cell_type": "code", |
|
|
1124 |
"execution_count": 25, |
|
|
1125 |
"metadata": {}, |
|
|
1126 |
"outputs": [ |
|
|
1127 |
{ |
|
|
1128 |
"name": "stdout", |
|
|
1129 |
"output_type": "stream", |
|
|
1130 |
"text": [ |
|
|
1131 |
"The 'Gene' column of the training dataset once transformed using one hot encoding has shape: (2656, 244)\n" |
|
|
1132 |
] |
|
|
1133 |
} |
|
|
1134 |
], |
|
|
1135 |
"source": [ |
|
|
1136 |
"\n", |
|
|
1137 |
"#One Hot Encoding is a representation of categorical variables as binary vectors to be used in a machine learning model.\n", |
|
|
1138 |
"\n", |
|
|
1139 |
"from sklearn.preprocessing import OneHotEncoder\n", |
|
|
1140 |
"encoder = OneHotEncoder(handle_unknown='ignore')\n", |
|
|
1141 |
"train_gene_oneHotEncoding = encoder.fit_transform(X_train[['Gene']])\n", |
|
|
1142 |
"cv_gene_oneHotEncoding = encoder.transform(X_vaildation[['Gene']])\n", |
|
|
1143 |
"test_gene_oneHotEncoding = encoder.transform(X_test[['Gene']])\n", |
|
|
1144 |
"\n", |
|
|
1145 |
"print(\"The 'Gene' column of the training dataset once transformed using one hot encoding has shape: \", train_gene_oneHotEncoding.shape)" |
|
|
1146 |
] |
|
|
1147 |
}, |
|
|
1148 |
{ |
|
|
1149 |
"cell_type": "code", |
|
|
1150 |
"execution_count": 26, |
|
|
1151 |
"metadata": {}, |
|
|
1152 |
"outputs": [ |
|
|
1153 |
{ |
|
|
1154 |
"name": "stdout", |
|
|
1155 |
"output_type": "stream", |
|
|
1156 |
"text": [ |
|
|
1157 |
"When model trained on just 'Gene' feature, the log loss for test data is 1.6249787609089872\n" |
|
|
1158 |
] |
|
|
1159 |
}, |
|
|
1160 |
{ |
|
|
1161 |
"name": "stderr", |
|
|
1162 |
"output_type": "stream", |
|
|
1163 |
"text": [ |
|
|
1164 |
"c:\\Users\\chawl\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_split.py:676: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n", |
|
|
1165 |
" warnings.warn(\n" |
|
|
1166 |
] |
|
|
1167 |
} |
|
|
1168 |
], |
|
|
1169 |
"source": [ |
|
|
1170 |
"#Building a model just based on the gene feature \n", |
|
|
1171 |
"from sklearn.linear_model import SGDClassifier\n", |
|
|
1172 |
"from sklearn.calibration import CalibratedClassifierCV\n", |
|
|
1173 |
"sgdc_gene_model = SGDClassifier() # default method = 'sigmoid' (logistic regression model) \n", |
|
|
1174 |
"sgdc_gene_model.fit(train_gene_oneHotEncoding, y_train)\n", |
|
|
1175 |
"\n", |
|
|
1176 |
"#Calibrated Classification\n", |
|
|
1177 |
"calibrator = CalibratedClassifierCV(sgdc_gene_model)\n", |
|
|
1178 |
"calibrator.fit(cv_gene_oneHotEncoding, y_validation)\n", |
|
|
1179 |
"ypredict_cal_test = calibrator.predict_proba(test_gene_oneHotEncoding) \n", |
|
|
1180 |
"\n", |
|
|
1181 |
"# printed for my own understanding \n", |
|
|
1182 |
"#print(y_test) \n", |
|
|
1183 |
"#print(y_test.shape)\n", |
|
|
1184 |
"#print(ypredict_cal_test[:, 1])\n", |
|
|
1185 |
"#print(ypredict_cal_test[:, 1].shape)\n", |
|
|
1186 |
"from sklearn.metrics import log_loss\n", |
|
|
1187 |
"log_loss_test = log_loss(y_test, ypredict_cal_test, labels=sgdc_gene_model.classes_)\n", |
|
|
1188 |
"print(\"When model trained on just 'Gene' feature, the log loss for test data is \", log_loss_test)\n", |
|
|
1189 |
"\n", |
|
|
1190 |
"\n" |
|
|
1191 |
] |
|
|
1192 |
}, |
|
|
1193 |
{ |
|
|
1194 |
"cell_type": "code", |
|
|
1195 |
"execution_count": 27, |
|
|
1196 |
"metadata": {}, |
|
|
1197 |
"outputs": [ |
|
|
1198 |
{ |
|
|
1199 |
"name": "stdout", |
|
|
1200 |
"output_type": "stream", |
|
|
1201 |
"text": [ |
|
|
1202 |
"Number of unique genes in training set: 244\n", |
|
|
1203 |
"Number of unique genes in cross-validation set 109\n", |
|
|
1204 |
"Number of unique genes in testing set 109\n", |
|
|
1205 |
"Percentage of genes in cross-validation set that have not been encountered in the training dataset = 11.926605504587156\n", |
|
|
1206 |
"Percentage of genes in Testing set that have not been encountered in the training dataset = 6.422018348623854\n" |
|
|
1207 |
] |
|
|
1208 |
} |
|
|
1209 |
], |
|
|
1210 |
"source": [ |
|
|
1211 |
"#Question: How many genes in the cross-validation and testing set are the same unique_genes that were encountered in the training set? \n", |
|
|
1212 |
"print(\"Number of unique genes in training set: \", unique_genes.shape[0])\n", |
|
|
1213 |
"unique_genes_cv = X_vaildation['Gene'].value_counts()\n", |
|
|
1214 |
"unique_genes_test = X_test['Gene'].value_counts()\n", |
|
|
1215 |
"print(\"Number of unique genes in cross-validation set \", unique_genes_cv.shape[0])\n", |
|
|
1216 |
"print(\"Number of unique genes in testing set \", unique_genes_test.shape[0])\n", |
|
|
1217 |
"\n", |
|
|
1218 |
"counter = 0\n", |
|
|
1219 |
"\n", |
|
|
1220 |
"for i in unique_genes_cv.index:\n", |
|
|
1221 |
" if i not in unique_genes.index:\n", |
|
|
1222 |
" counter +=1\n", |
|
|
1223 |
"\n", |
|
|
1224 |
"print(\"Percentage of genes in cross-validation set that have not been encountered in the training dataset = \", counter/unique_genes_cv.shape[0]*100)\n", |
|
|
1225 |
"\n", |
|
|
1226 |
"counter2 =0\n", |
|
|
1227 |
"for i in unique_genes_test.index:\n", |
|
|
1228 |
" if i not in unique_genes.index:\n", |
|
|
1229 |
" counter2 +=1\n", |
|
|
1230 |
"\n", |
|
|
1231 |
"print(\"Percentage of genes in Testing set that have not been encountered in the training dataset = \", counter2/unique_genes_test.shape[0]*100)\n" |
|
|
1232 |
] |
|
|
1233 |
}, |
|
|
1234 |
{ |
|
|
1235 |
"attachments": {}, |
|
|
1236 |
"cell_type": "markdown", |
|
|
1237 |
"metadata": {}, |
|
|
1238 |
"source": [ |
|
|
1239 |
"Variation feature" |
|
|
1240 |
] |
|
|
1241 |
}, |
|
|
1242 |
{ |
|
|
1243 |
"cell_type": "code", |
|
|
1244 |
"execution_count": 28, |
|
|
1245 |
"metadata": {}, |
|
|
1246 |
"outputs": [ |
|
|
1247 |
{ |
|
|
1248 |
"name": "stdout", |
|
|
1249 |
"output_type": "stream", |
|
|
1250 |
"text": [ |
|
|
1251 |
"Number of variations in training set: 2656\n", |
|
|
1252 |
"Total number of unique variations in training set: 2406\n", |
|
|
1253 |
"Top ten most frequently occuring variations are:\n", |
|
|
1254 |
" Truncating_Mutations 70\n", |
|
|
1255 |
"Amplification 59\n", |
|
|
1256 |
"Deletion 57\n", |
|
|
1257 |
"Fusions 28\n", |
|
|
1258 |
"Overexpression 5\n", |
|
|
1259 |
"G12V 4\n", |
|
|
1260 |
"Q61L 3\n", |
|
|
1261 |
"E17K 3\n", |
|
|
1262 |
"F384L 2\n", |
|
|
1263 |
"Q209L 2\n", |
|
|
1264 |
"Name: Variation, dtype: int64\n" |
|
|
1265 |
] |
|
|
1266 |
} |
|
|
1267 |
], |
|
|
1268 |
"source": [ |
|
|
1269 |
"#Number of unique variations \n", |
|
|
1270 |
"\n", |
|
|
1271 |
"print(\"Number of variations in training set: \", X_train['Variation'].shape[0])\n", |
|
|
1272 |
"unique_variations = X_train['Variation'].value_counts()\n", |
|
|
1273 |
"print(\"Total number of unique variations in training set: \", unique_variations.shape[0])\n", |
|
|
1274 |
"print(\"Top ten most frequently occuring variations are:\\n\", unique_variations.head(10))" |
|
|
1275 |
] |
|
|
1276 |
}, |
|
|
1277 |
{ |
|
|
1278 |
"cell_type": "code", |
|
|
1279 |
"execution_count": 29, |
|
|
1280 |
"metadata": {}, |
|
|
1281 |
"outputs": [ |
|
|
1282 |
{ |
|
|
1283 |
"name": "stdout", |
|
|
1284 |
"output_type": "stream", |
|
|
1285 |
"text": [ |
|
|
1286 |
"Percentage of variations in cross-validation set that have not been encountered in the training dataset = 96.36963696369637\n", |
|
|
1287 |
"Percentage of variations in cross-validation set that have not been encountered in the training dataset = 96.45161290322581\n" |
|
|
1288 |
] |
|
|
1289 |
} |
|
|
1290 |
], |
|
|
1291 |
"source": [ |
|
|
1292 |
"#Overlap between training, CV, and Testing set \n", |
|
|
1293 |
"unique_variations_cv = X_vaildation['Variation'].value_counts()\n", |
|
|
1294 |
"\n", |
|
|
1295 |
"counter3 =0\n", |
|
|
1296 |
"for i in unique_variations_cv.index:\n", |
|
|
1297 |
" if i not in unique_variations.index:\n", |
|
|
1298 |
" counter3 +=1\n", |
|
|
1299 |
"\n", |
|
|
1300 |
"print(\"Percentage of variations in cross-validation set that have not been encountered in the training dataset = \", (counter3/unique_variations_cv.shape[0])*100)\n", |
|
|
1301 |
"\n", |
|
|
1302 |
"unique_variations_test = X_test['Variation'].value_counts()\n", |
|
|
1303 |
"\n", |
|
|
1304 |
"counter4 =0\n", |
|
|
1305 |
"for i in unique_variations_test.index:\n", |
|
|
1306 |
" if i not in unique_variations.index:\n", |
|
|
1307 |
" counter4 +=1\n", |
|
|
1308 |
"\n", |
|
|
1309 |
"print(\"Percentage of variations in cross-validation set that have not been encountered in the training dataset = \", (counter4/unique_variations_test.shape[0])*100)\n", |
|
|
1310 |
"\n", |
|
|
1311 |
"\n" |
|
|
1312 |
] |
|
|
1313 |
}, |
|
|
1314 |
{ |
|
|
1315 |
"cell_type": "code", |
|
|
1316 |
"execution_count": 30, |
|
|
1317 |
"metadata": {}, |
|
|
1318 |
"outputs": [ |
|
|
1319 |
{ |
|
|
1320 |
"data": { |
|
|
1321 |
"image/png": "", |
|
|
1322 |
"text/plain": [ |
|
|
1323 |
"<Figure size 640x480 with 1 Axes>" |
|
|
1324 |
] |
|
|
1325 |
}, |
|
|
1326 |
"metadata": {}, |
|
|
1327 |
"output_type": "display_data" |
|
|
1328 |
} |
|
|
1329 |
], |
|
|
1330 |
"source": [ |
|
|
1331 |
"#histogram to see frequency of variations\n", |
|
|
1332 |
"s2= sum(unique_variations.values)\n", |
|
|
1333 |
"h2 = unique_variations.values/s2\n", |
|
|
1334 |
"plt.plot(h2, label = \"Histogram of Variations\")\n", |
|
|
1335 |
"plt.ylabel(\"Number of occurences\")\n", |
|
|
1336 |
"plt.xlabel(\"Index of a Variation\")\n", |
|
|
1337 |
"plt.legend()\n", |
|
|
1338 |
"plt.grid()\n", |
|
|
1339 |
"plt.show()" |
|
|
1340 |
] |
|
|
1341 |
}, |
|
|
1342 |
{ |
|
|
1343 |
"cell_type": "code", |
|
|
1344 |
"execution_count": 31, |
|
|
1345 |
"metadata": {}, |
|
|
1346 |
"outputs": [ |
|
|
1347 |
{ |
|
|
1348 |
"data": { |
|
|
1349 |
"image/png": "", |
|
|
1350 |
"text/plain": [ |
|
|
1351 |
"<Figure size 640x480 with 1 Axes>" |
|
|
1352 |
] |
|
|
1353 |
}, |
|
|
1354 |
"metadata": {}, |
|
|
1355 |
"output_type": "display_data" |
|
|
1356 |
}, |
|
|
1357 |
{ |
|
|
1358 |
"name": "stdout", |
|
|
1359 |
"output_type": "stream", |
|
|
1360 |
"text": [ |
|
|
1361 |
"Constant growth of cdf graph, meaning almost all genes contribute equally to the dataset\n" |
|
|
1362 |
] |
|
|
1363 |
} |
|
|
1364 |
], |
|
|
1365 |
"source": [ |
|
|
1366 |
"#cumulative distribution of variations \n", |
|
|
1367 |
"cdf2 = np.cumsum(h2)\n", |
|
|
1368 |
"plt.plot(cdf2, color =\"green\", label = \"Cumulative Distribution of Variations\")\n", |
|
|
1369 |
"plt.legend()\n", |
|
|
1370 |
"plt.grid()\n", |
|
|
1371 |
"plt.show()\n", |
|
|
1372 |
"print(\"Constant growth of cdf graph, meaning almost all genes contribute equally to the dataset\")" |
|
|
1373 |
] |
|
|
1374 |
}, |
|
|
1375 |
{ |
|
|
1376 |
"cell_type": "code", |
|
|
1377 |
"execution_count": 32, |
|
|
1378 |
"metadata": {}, |
|
|
1379 |
"outputs": [ |
|
|
1380 |
{ |
|
|
1381 |
"name": "stdout", |
|
|
1382 |
"output_type": "stream", |
|
|
1383 |
"text": [ |
|
|
1384 |
"Here's the first 10 items in the Variation array after response coding: \n", |
|
|
1385 |
"\n", |
|
|
1386 |
"('Truncating_Mutations', [0.027463651050080775, 0.0008077544426494346, 0.0004038772213247173, 0.0008077544426494346, 0.0004038772213247173, 0.0008077544426494346, 0.0004038772213247173, 0.0004038772213247173, 0.0004038772213247173])\n", |
|
|
1387 |
"('Amplification', [0.00040567951318458417, 0.008113590263691683, 0.00040567951318458417, 0.00040567951318458417, 0.00040567951318458417, 0.004056795131845842, 0.012981744421906694, 0.00040567951318458417, 0.00040567951318458417])\n", |
|
|
1388 |
"('Deletion', [0.017864393016646368, 0.0004060089321965083, 0.0004060089321965083, 0.005684125050751117, 0.0004060089321965083, 0.0008120178643930166, 0.0004060089321965083, 0.0004060089321965083, 0.0004060089321965083])\n", |
|
|
1389 |
"('Fusions', [0.0012325390304026294, 0.010682004930156122, 0.0004108463434675431, 0.0004108463434675431, 0.0004108463434675431, 0.0004108463434675431, 0.0004108463434675431, 0.0008216926869350862, 0.0004108463434675431])\n", |
|
|
1390 |
"('Overexpression', [0.00041476565740356696, 0.001244296972210701, 0.00041476565740356696, 0.00041476565740356696, 0.00041476565740356696, 0.00041476565740356696, 0.0016590626296142678, 0.00041476565740356696, 0.00041476565740356696])\n", |
|
|
1391 |
"('G12V', [0.0004149377593360996, 0.0004149377593360996, 0.0004149377593360996, 0.0004149377593360996, 0.0004149377593360996, 0.0004149377593360996, 0.002074688796680498, 0.0004149377593360996, 0.0004149377593360996])\n", |
|
|
1392 |
"('Q61L', [0.00041511000415110004, 0.00041511000415110004, 0.00041511000415110004, 0.00041511000415110004, 0.00041511000415110004, 0.00041511000415110004, 0.0016604400166044002, 0.00041511000415110004, 0.00041511000415110004])\n", |
|
|
1393 |
"('E17K', [0.00041511000415110004, 0.00041511000415110004, 0.00041511000415110004, 0.00041511000415110004, 0.00041511000415110004, 0.00041511000415110004, 0.0016604400166044002, 0.00041511000415110004, 0.00041511000415110004])\n", |
|
|
1394 |
"('F384L', [0.0004152823920265781, 0.0008305647840531562, 0.0004152823920265781, 0.0004152823920265781, 0.0008305647840531562, 0.0004152823920265781, 0.0004152823920265781, 0.0004152823920265781, 0.0004152823920265781])\n", |
|
|
1395 |
"('Q209L', [0.0004152823920265781, 0.0004152823920265781, 0.0004152823920265781, 0.0004152823920265781, 0.0004152823920265781, 0.0004152823920265781, 0.0012458471760797341, 0.0004152823920265781, 0.0004152823920265781])\n", |
|
|
1396 |
"\n", |
|
|
1397 |
"\n", |
|
|
1398 |
"From the first 10 variations shown above, we can see that if it is a Truncating Mutation, it has the highest probability to belong to class 1\n" |
|
|
1399 |
] |
|
|
1400 |
} |
|
|
1401 |
], |
|
|
1402 |
"source": [ |
|
|
1403 |
"#Response Coding of variations\n", |
|
|
1404 |
"def variation_response_coding(df):\n", |
|
|
1405 |
" var_dict = dict()\n", |
|
|
1406 |
" values = df['Variation'].value_counts()\n", |
|
|
1407 |
" for var, freq in values.items():\n", |
|
|
1408 |
" prob_vec = []\n", |
|
|
1409 |
" for cancer_class in range(1,10):\n", |
|
|
1410 |
" class_occurence = df.loc[(df['Class'] == cancer_class) & (df['Variation'] == var)]\n", |
|
|
1411 |
" prob_vec.append((class_occurence.shape[0] + alpha)/(freq + (alpha * values.shape[0])))\n", |
|
|
1412 |
" var_dict[var] = prob_vec\n", |
|
|
1413 |
" \n", |
|
|
1414 |
" return var_dict\n", |
|
|
1415 |
"\n", |
|
|
1416 |
"\n", |
|
|
1417 |
"train_var_responseCoding = variation_response_coding(X_train)\n", |
|
|
1418 |
"cv_var_responseCoding = variation_response_coding(X_vaildation)\n", |
|
|
1419 |
"test_var_responseCoding = variation_response_coding(X_test)\n", |
|
|
1420 |
"print(\"Here's the first 10 items in the Variation array after response coding: \\n\")\n", |
|
|
1421 |
"from itertools import islice\n", |
|
|
1422 |
"n_items2 = list(islice(train_var_responseCoding.items(), 10))\n", |
|
|
1423 |
"for i in range(0,10):\n", |
|
|
1424 |
" print(n_items2[i]) \n", |
|
|
1425 |
"\n", |
|
|
1426 |
"print(\"\\n\")\n", |
|
|
1427 |
"print(\"From the first 10 variations shown above, we can see that if it is a Truncating Mutation, it has the highest probability to belong to class 1\")" |
|
|
1428 |
] |
|
|
1429 |
}, |
|
|
1430 |
{ |
|
|
1431 |
"cell_type": "code", |
|
|
1432 |
"execution_count": 33, |
|
|
1433 |
"metadata": {}, |
|
|
1434 |
"outputs": [ |
|
|
1435 |
{ |
|
|
1436 |
"name": "stdout", |
|
|
1437 |
"output_type": "stream", |
|
|
1438 |
"text": [ |
|
|
1439 |
"The 'Variation' column of the training dataset once transformed using one hot encoding has shape: (2656, 2406) \n", |
|
|
1440 |
"\n", |
|
|
1441 |
"Printing the first row: \n", |
|
|
1442 |
" (0, 1765)\t1.0\n" |
|
|
1443 |
] |
|
|
1444 |
} |
|
|
1445 |
], |
|
|
1446 |
"source": [ |
|
|
1447 |
"#one-hot encoding of variations \n", |
|
|
1448 |
"\n", |
|
|
1449 |
"train_var_oneHotEncoding = encoder.fit_transform(X_train[['Variation']])\n", |
|
|
1450 |
"cv_var_oneHotEncoding = encoder.transform(X_vaildation[['Variation']])\n", |
|
|
1451 |
"test_var_oneHotEncoding = encoder.transform(X_test[['Variation']])\n", |
|
|
1452 |
"\n", |
|
|
1453 |
"\n", |
|
|
1454 |
"print(\"The 'Variation' column of the training dataset once transformed using one hot encoding has shape: \", train_var_oneHotEncoding.shape, \"\\n\")\n", |
|
|
1455 |
"print(\"Printing the first row: \")\n", |
|
|
1456 |
"print(train_var_oneHotEncoding[1])" |
|
|
1457 |
] |
|
|
1458 |
}, |
|
|
1459 |
{ |
|
|
1460 |
"cell_type": "code", |
|
|
1461 |
"execution_count": 34, |
|
|
1462 |
"metadata": {}, |
|
|
1463 |
"outputs": [ |
|
|
1464 |
{ |
|
|
1465 |
"name": "stdout", |
|
|
1466 |
"output_type": "stream", |
|
|
1467 |
"text": [ |
|
|
1468 |
"When model trained on just 'Variation' feature, the log loss for test data is 1.799630234372917\n" |
|
|
1469 |
] |
|
|
1470 |
}, |
|
|
1471 |
{ |
|
|
1472 |
"name": "stderr", |
|
|
1473 |
"output_type": "stream", |
|
|
1474 |
"text": [ |
|
|
1475 |
"c:\\Users\\chawl\\anaconda3\\lib\\site-packages\\sklearn\\model_selection\\_split.py:676: UserWarning: The least populated class in y has only 1 members, which is less than n_splits=5.\n", |
|
|
1476 |
" warnings.warn(\n" |
|
|
1477 |
] |
|
|
1478 |
} |
|
|
1479 |
], |
|
|
1480 |
"source": [ |
|
|
1481 |
"#Building a model just based on 'Variation' feature and seeing how effective it is in prediciting 'Class'\n", |
|
|
1482 |
"#for future purposes, alpha could be varied to see how the model performance increases or decreses, but for my first project I'll just go with default alpha = 0.0001\n", |
|
|
1483 |
"\n", |
|
|
1484 |
"sgdc_var_model = SGDClassifier() # default method = 'sigmoid' (logistic regression model) \n", |
|
|
1485 |
"sgdc_var_model.fit(train_var_oneHotEncoding, y_train)\n", |
|
|
1486 |
"\n", |
|
|
1487 |
"#Calibrated Classification\n", |
|
|
1488 |
"calibrator2 = CalibratedClassifierCV(sgdc_var_model)\n", |
|
|
1489 |
"calibrator2.fit(cv_var_oneHotEncoding, y_validation)\n", |
|
|
1490 |
"ypredict_cal_test2 = calibrator2.predict_proba(test_var_oneHotEncoding) \n", |
|
|
1491 |
"\n", |
|
|
1492 |
"log_loss_test_var = log_loss(y_test, ypredict_cal_test2, labels=sgdc_var_model.classes_)\n", |
|
|
1493 |
"print(\"When model trained on just 'Variation' feature, the log loss for test data is \", log_loss_test_var)" |
|
|
1494 |
] |
|
|
1495 |
}, |
|
|
1496 |
{ |
|
|
1497 |
"attachments": {}, |
|
|
1498 |
"cell_type": "markdown", |
|
|
1499 |
"metadata": {}, |
|
|
1500 |
"source": [ |
|
|
1501 |
"'Text' Feature " |
|
|
1502 |
] |
|
|
1503 |
}, |
|
|
1504 |
{ |
|
|
1505 |
"cell_type": "code", |
|
|
1506 |
"execution_count": 35, |
|
|
1507 |
"metadata": {}, |
|
|
1508 |
"outputs": [ |
|
|
1509 |
{ |
|
|
1510 |
"name": "stdout", |
|
|
1511 |
"output_type": "stream", |
|
|
1512 |
"text": [ |
|
|
1513 |
"Total number of unique words in training set: 138265\n", |
|
|
1514 |
"Total number of unique words in cross-validation set: 54558\n", |
|
|
1515 |
"Total number of unique words in testing set: 53565 \n", |
|
|
1516 |
"\n", |
|
|
1517 |
"Percentage of words in cross-validation set that have not been encountered in the training dataset = 14.10059019758789\n", |
|
|
1518 |
"Percentage of words in testing set that have not been encountered in the training dataset = 14.177167926817885\n" |
|
|
1519 |
] |
|
|
1520 |
} |
|
|
1521 |
], |
|
|
1522 |
"source": [ |
|
|
1523 |
"#Overlap between Train, CV and Test set \n", |
|
|
1524 |
"\n", |
|
|
1525 |
"from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n", |
|
|
1526 |
"\n", |
|
|
1527 |
"train_text_allwords = CountVectorizer()\n", |
|
|
1528 |
"train_text_allwords_vec = train_text_allwords.fit_transform(X_train['Text'])\n", |
|
|
1529 |
"print(\"Total number of unique words in training set: \", len(train_text_allwords.vocabulary_))\n", |
|
|
1530 |
"\n", |
|
|
1531 |
"text_cv_vectorizer = CountVectorizer()\n", |
|
|
1532 |
"validation_text_onehotencoding = text_cv_vectorizer.fit_transform(X_vaildation['Text'])\n", |
|
|
1533 |
"print(\"Total number of unique words in cross-validation set: \", len(text_cv_vectorizer.vocabulary_))\n", |
|
|
1534 |
"\n", |
|
|
1535 |
"text_test_vectorizer = CountVectorizer()\n", |
|
|
1536 |
"test_text_onehotencoding = text_test_vectorizer.fit_transform(X_test['Text'])\n", |
|
|
1537 |
"print(\"Total number of unique words in testing set: \", len(text_test_vectorizer.vocabulary_), \"\\n\")\n", |
|
|
1538 |
"\n", |
|
|
1539 |
"counter5 =0\n", |
|
|
1540 |
"for i in text_cv_vectorizer.vocabulary_.keys():\n", |
|
|
1541 |
" if i not in train_text_allwords.vocabulary_.keys():\n", |
|
|
1542 |
" counter5 +=1\n", |
|
|
1543 |
"\n", |
|
|
1544 |
"#print(counter5)\n", |
|
|
1545 |
"\n", |
|
|
1546 |
"print(\"Percentage of words in cross-validation set that have not been encountered in the training dataset = \", (counter5/len(text_cv_vectorizer.vocabulary_))*100)\n", |
|
|
1547 |
"\n", |
|
|
1548 |
"\n", |
|
|
1549 |
"counter6 =0\n", |
|
|
1550 |
"for i in text_test_vectorizer.vocabulary_.keys():\n", |
|
|
1551 |
" if i not in train_text_allwords.vocabulary_.keys():\n", |
|
|
1552 |
" counter6 +=1\n", |
|
|
1553 |
"\n", |
|
|
1554 |
"\n", |
|
|
1555 |
"print(\"Percentage of words in testing set that have not been encountered in the training dataset = \", (counter6/len(text_test_vectorizer.vocabulary_))*100)\n" |
|
|
1556 |
] |
|
|
1557 |
}, |
|
|
1558 |
{ |
|
|
1559 |
"cell_type": "code", |
|
|
1560 |
"execution_count": 36, |
|
|
1561 |
"metadata": {}, |
|
|
1562 |
"outputs": [ |
|
|
1563 |
{ |
|
|
1564 |
"name": "stderr", |
|
|
1565 |
"output_type": "stream", |
|
|
1566 |
"text": [ |
|
|
1567 |
"c:\\Users\\chawl\\anaconda3\\lib\\site-packages\\sklearn\\utils\\deprecation.py:87: FutureWarning: Function get_feature_names is deprecated; get_feature_names is deprecated in 1.0 and will be removed in 1.2. Please use get_feature_names_out instead.\n", |
|
|
1568 |
" warnings.warn(msg, category=FutureWarning)\n" |
|
|
1569 |
] |
|
|
1570 |
}, |
|
|
1571 |
{ |
|
|
1572 |
"name": "stdout", |
|
|
1573 |
"output_type": "stream", |
|
|
1574 |
"text": [ |
|
|
1575 |
"Total number of unique words that occured atleast 5 times in training set: 43391\n" |
|
|
1576 |
] |
|
|
1577 |
} |
|
|
1578 |
], |
|
|
1579 |
"source": [ |
|
|
1580 |
"# one-hot encoding \n", |
|
|
1581 |
"#for words that occured atleast five times in the dataset \n", |
|
|
1582 |
"\n", |
|
|
1583 |
"\n", |
|
|
1584 |
"#building TfidVectorizer with all words that occured min 5 times in training data\n", |
|
|
1585 |
"\n", |
|
|
1586 |
"text_vectorizer = TfidfVectorizer(min_df =5, binary=True, use_idf=True)\n", |
|
|
1587 |
"train_text_onehotencoding = text_vectorizer.fit_transform(X_train['Text'])\n", |
|
|
1588 |
"# getting all the feature names (words)\n", |
|
|
1589 |
"train_text_features= text_vectorizer.get_feature_names()\n", |
|
|
1590 |
"print(\"Total number of unique words that occured atleast 5 times in training set: \", len(train_text_features))\n", |
|
|
1591 |
"\n", |
|
|
1592 |
"cv_text_onehotencoding = text_vectorizer.transform(X_vaildation['Text'])\n", |
|
|
1593 |
"\n", |
|
|
1594 |
"test_text_onehotencoding = text_vectorizer.transform(X_test['Text'])\n", |
|
|
1595 |
"\n", |
|
|
1596 |
"\n", |
|
|
1597 |
"#normalizing to [0,1]\n", |
|
|
1598 |
"from sklearn.preprocessing import normalize\n", |
|
|
1599 |
"train_text_onehotencoding = normalize(train_text_onehotencoding, axis =0)\n", |
|
|
1600 |
"cv_text_onehotencoding = normalize(cv_text_onehotencoding, axis =0)\n", |
|
|
1601 |
"test_text_onehotencoding = normalize(test_text_onehotencoding, axis =0)\n" |
|
|
1602 |
] |
|
|
1603 |
}, |
|
|
1604 |
{ |
|
|
1605 |
"cell_type": "code", |
|
|
1606 |
"execution_count": 37, |
|
|
1607 |
"metadata": {}, |
|
|
1608 |
"outputs": [ |
|
|
1609 |
{ |
|
|
1610 |
"name": "stdout", |
|
|
1611 |
"output_type": "stream", |
|
|
1612 |
"text": [ |
|
|
1613 |
"When model trained on just 'Text' feature, the log loss for test data is 1.3097283788034129\n" |
|
|
1614 |
] |
|
|
1615 |
} |
|
|
1616 |
], |
|
|
1617 |
"source": [ |
|
|
1618 |
"#building a model based on just 'Text' feature to see how well it is able to predict class\n", |
|
|
1619 |
"\n", |
|
|
1620 |
"sgdc_text_model = SGDClassifier(loss = 'log', random_state=42, penalty=\"l2\") # default method = 'sigmoid' (logistic regression model) \n", |
|
|
1621 |
"sgdc_text_model.fit(train_text_onehotencoding, y_train)\n", |
|
|
1622 |
"\n", |
|
|
1623 |
"#Calibrated Classification\n", |
|
|
1624 |
"calibrator3 = CalibratedClassifierCV(sgdc_text_model, method = 'sigmoid')\n", |
|
|
1625 |
"calibrator3.fit(train_text_onehotencoding, y_train)\n", |
|
|
1626 |
"predict_y = calibrator3.predict_proba(cv_text_onehotencoding) \n", |
|
|
1627 |
"\n", |
|
|
1628 |
"log_loss_test_text = log_loss(y_validation, predict_y, labels=sgdc_text_model.classes_)\n", |
|
|
1629 |
"print(\"When model trained on just 'Text' feature, the log loss for test data is \", log_loss_test_text)" |
|
|
1630 |
] |
|
|
1631 |
}, |
|
|
1632 |
{ |
|
|
1633 |
"attachments": {}, |
|
|
1634 |
"cell_type": "markdown", |
|
|
1635 |
"metadata": {}, |
|
|
1636 |
"source": [ |
|
|
1637 |
"Stacking all 3 features to prepare for model " |
|
|
1638 |
] |
|
|
1639 |
}, |
|
|
1640 |
{ |
|
|
1641 |
"cell_type": "code", |
|
|
1642 |
"execution_count": 38, |
|
|
1643 |
"metadata": {}, |
|
|
1644 |
"outputs": [ |
|
|
1645 |
{ |
|
|
1646 |
"name": "stdout", |
|
|
1647 |
"output_type": "stream", |
|
|
1648 |
"text": [ |
|
|
1649 |
"(2656, 244)\n", |
|
|
1650 |
"<class 'scipy.sparse.csr.csr_matrix'>\n", |
|
|
1651 |
"(2656, 2406)\n", |
|
|
1652 |
"<class 'scipy.sparse.csr.csr_matrix'>\n" |
|
|
1653 |
] |
|
|
1654 |
} |
|
|
1655 |
], |
|
|
1656 |
"source": [ |
|
|
1657 |
"print(train_gene_oneHotEncoding.shape)\n", |
|
|
1658 |
"print(type(train_gene_oneHotEncoding))\n", |
|
|
1659 |
"print(train_var_oneHotEncoding.shape)\n", |
|
|
1660 |
"print(type(train_var_oneHotEncoding))" |
|
|
1661 |
] |
|
|
1662 |
}, |
|
|
1663 |
{ |
|
|
1664 |
"cell_type": "code", |
|
|
1665 |
"execution_count": 39, |
|
|
1666 |
"metadata": {}, |
|
|
1667 |
"outputs": [], |
|
|
1668 |
"source": [ |
|
|
1669 |
"from scipy.sparse import hstack\n", |
|
|
1670 |
"\n", |
|
|
1671 |
"train_gv_ohc = hstack((train_gene_oneHotEncoding, train_var_oneHotEncoding))  # gene first, then variation: same column order as the cv/test stacks below\n",
|
|
1672 |
"cv_gv_ohc = hstack((cv_gene_oneHotEncoding, cv_var_oneHotEncoding))\n", |
|
|
1673 |
"test_gv_ohc = hstack((test_gene_oneHotEncoding, test_var_oneHotEncoding))\n", |
|
|
1674 |
"\n", |
|
|
1675 |
"train_X_onehotencoding = hstack((train_gv_ohc, train_text_onehotencoding))\n", |
|
|
1676 |
"\n", |
|
|
1677 |
"cv_X_onehotencoding = hstack((cv_gv_ohc, cv_text_onehotencoding))\n", |
|
|
1678 |
"\n", |
|
|
1679 |
"test_X_onehotencoding = hstack((test_gv_ohc, test_text_onehotencoding))\n" |
|
|
1680 |
] |
|
|
1681 |
}, |
|
|
1682 |
{ |
|
|
1683 |
"cell_type": "code", |
|
|
1684 |
"execution_count": 40, |
|
|
1685 |
"metadata": {}, |
|
|
1686 |
"outputs": [ |
|
|
1687 |
{ |
|
|
1688 |
"name": "stdout", |
|
|
1689 |
"output_type": "stream", |
|
|
1690 |
"text": [ |
|
|
1691 |
"(Number of records, number of features) in the final dataset use for training is: (2656, 46041)\n", |
|
|
1692 |
"(Number of records, number of features) in the final dataset use for cross-validation is: (332, 46041)\n", |
|
|
1693 |
"(Number of records, number of features) in the final dataset use for testing is: (333, 46041)\n" |
|
|
1694 |
] |
|
|
1695 |
} |
|
|
1696 |
], |
|
|
1697 |
"source": [ |
|
|
1698 |
"#Final dataset features \n", |
|
|
1699 |
"\n", |
|
|
1700 |
"print(\"(Number of records, number of features) in the final dataset use for training is: \", train_X_onehotencoding.shape)\n", |
|
|
1701 |
"print(\"(Number of records, number of features) in the final dataset use for cross-validation is: \", cv_X_onehotencoding.shape)\n", |
|
|
1702 |
"print(\"(Number of records, number of features) in the final dataset use for testing is: \", test_X_onehotencoding.shape)" |
|
|
1703 |
] |
|
|
1704 |
}, |
|
|
1705 |
{ |
|
|
1706 |
"attachments": {}, |
|
|
1707 |
"cell_type": "markdown", |
|
|
1708 |
"metadata": {}, |
|
|
1709 |
"source": [ |
|
|
1710 |
"#### Machine Learning Models" |
|
|
1711 |
] |
|
|
1712 |
}, |
|
|
1713 |
{ |
|
|
1714 |
"attachments": {}, |
|
|
1715 |
"cell_type": "markdown", |
|
|
1716 |
"metadata": {}, |
|
|
1717 |
"source": [ |
|
|
1718 |
"Linear Support Vector Machine " |
|
|
1719 |
] |
|
|
1720 |
}, |
|
|
1721 |
{ |
|
|
1722 |
"cell_type": "code", |
|
|
1723 |
"execution_count": 41, |
|
|
1724 |
"metadata": {}, |
|
|
1725 |
"outputs": [ |
|
|
1726 |
{ |
|
|
1727 |
"name": "stdout", |
|
|
1728 |
"output_type": "stream", |
|
|
1729 |
"text": [ |
|
|
1730 |
"Log Loss with Support Vector Machine on Training set: 0.49413293509235245\n", |
|
|
1731 |
"Log Loss with Support Vector Machine on Cross-Validatiob set: 1.379463690206224\n", |
|
|
1732 |
"Log Loss with Support Vector Machine on Testing set: 1.2134525598397619\n" |
|
|
1733 |
] |
|
|
1734 |
} |
|
|
1735 |
], |
|
|
1736 |
"source": [ |
|
|
1737 |
"svm = SGDClassifier(loss='hinge', random_state=42) \n", |
|
|
1738 |
"#loss ='hinge' gives linear SVM\n", |
|
|
1739 |
"#default penalty='l2' which is the standard regularizer for SVM \n",
|
|
1740 |
"\n", |
|
|
1741 |
"svm.fit(train_X_onehotencoding, y_train)\n", |
|
|
1742 |
"calibrated_svm = CalibratedClassifierCV(svm, method= 'sigmoid')\n", |
|
|
1743 |
"calibrated_svm.fit(train_X_onehotencoding, y_train)\n", |
|
|
1744 |
"calibrated_svm_cv_predict = calibrated_svm.predict_proba(cv_X_onehotencoding)\n", |
|
|
1745 |
"calibrated_svm_train_predict = calibrated_svm.predict_proba(train_X_onehotencoding)\n", |
|
|
1746 |
"calibrated_svm_test_predict = calibrated_svm.predict_proba(test_X_onehotencoding)\n", |
|
|
1747 |
"log_loss_svm_cv = log_loss(y_validation, calibrated_svm_cv_predict)\n", |
|
|
1748 |
"log_loss_svm_train = log_loss(y_train, calibrated_svm_train_predict)\n", |
|
|
1749 |
"log_loss_svm_test = log_loss(y_test, calibrated_svm_test_predict)\n", |
|
|
1750 |
"\n", |
|
|
1751 |
"\n", |
|
|
1752 |
"print(\"Log Loss with Support Vector Machine on Training set: \", log_loss_svm_train)\n", |
|
|
1753 |
"print(\"Log Loss with Support Vector Machine on Cross-Validation set: \", log_loss_svm_cv)\n",
|
|
1754 |
"print(\"Log Loss with Support Vector Machine on Testing set: \", log_loss_svm_test)\n", |
|
|
1755 |
"\n", |
|
|
1756 |
"\n" |
|
|
1757 |
] |
|
|
1758 |
}, |
|
|
1759 |
{ |
|
|
1760 |
"attachments": {}, |
|
|
1761 |
"cell_type": "markdown", |
|
|
1762 |
"metadata": {}, |
|
|
1763 |
"source": [ |
|
|
1764 |
"##### Evaluating The SVM Model Performance " |
|
|
1765 |
] |
|
|
1766 |
}, |
|
|
1767 |
{ |
|
|
1768 |
"cell_type": "code", |
|
|
1769 |
"execution_count": 42, |
|
|
1770 |
"metadata": {}, |
|
|
1771 |
"outputs": [ |
|
|
1772 |
{ |
|
|
1773 |
"name": "stdout", |
|
|
1774 |
"output_type": "stream", |
|
|
1775 |
"text": [ |
|
|
1776 |
" precision recall f1-score support\n", |
|
|
1777 |
"\n", |
|
|
1778 |
" 1 0.74 0.45 0.56 56\n", |
|
|
1779 |
" 2 0.72 0.46 0.56 46\n", |
|
|
1780 |
" 3 0.33 0.33 0.33 6\n", |
|
|
1781 |
" 4 0.64 0.76 0.70 74\n", |
|
|
1782 |
" 5 0.41 0.33 0.37 21\n", |
|
|
1783 |
" 6 0.81 0.64 0.71 33\n", |
|
|
1784 |
" 7 0.63 0.91 0.74 87\n", |
|
|
1785 |
" 8 0.00 0.00 0.00 2\n", |
|
|
1786 |
" 9 0.71 0.62 0.67 8\n", |
|
|
1787 |
"\n", |
|
|
1788 |
" accuracy 0.65 333\n", |
|
|
1789 |
" macro avg 0.56 0.50 0.51 333\n", |
|
|
1790 |
"weighted avg 0.66 0.65 0.63 333\n", |
|
|
1791 |
"\n" |
|
|
1792 |
] |
|
|
1793 |
} |
|
|
1794 |
], |
|
|
1795 |
"source": [ |
|
|
1796 |
"from sklearn.metrics import classification_report\n", |
|
|
1797 |
"\n", |
|
|
1798 |
"y_test_predict = calibrated_svm.predict(test_X_onehotencoding)\n", |
|
|
1799 |
"print(classification_report(y_test, y_test_predict, zero_division =0))" |
|
|
1800 |
] |
|
|
1801 |
}, |
|
|
1802 |
{ |
|
|
1803 |
"cell_type": "code", |
|
|
1804 |
"execution_count": 63, |
|
|
1805 |
"metadata": {}, |
|
|
1806 |
"outputs": [ |
|
|
1807 |
{ |
|
|
1808 |
"data": { |
|
|
1809 |
"image/png": "", |
|
|
1810 |
"text/plain": [ |
|
|
1811 |
"<Figure size 640x480 with 2 Axes>" |
|
|
1812 |
] |
|
|
1813 |
}, |
|
|
1814 |
"metadata": {}, |
|
|
1815 |
"output_type": "display_data" |
|
|
1816 |
} |
|
|
1817 |
], |
|
|
1818 |
"source": [ |
|
|
1819 |
"from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay\n",
|
|
1820 |
"confusion_matrix_svm = confusion_matrix(y_test, y_test_predict)\n", |
|
|
1821 |
"disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix_svm, display_labels=calibrated_svm.classes_)\n", |
|
|
1822 |
"disp.plot(cmap = \"PuRd\")\n", |
|
|
1823 |
"plt.xlabel(\"Predicted Class\")\n", |
|
|
1824 |
"plt.ylabel(\"True Class\")\n", |
|
|
1825 |
"plt.title(\"Confusion Matrix for SVM model\")\n", |
|
|
1826 |
"plt.show()" |
|
|
1827 |
] |
|
|
1828 |
} |
|
|
1829 |
], |
|
|
1830 |
"metadata": { |
|
|
1831 |
"kernelspec": { |
|
|
1832 |
"display_name": "base", |
|
|
1833 |
"language": "python", |
|
|
1834 |
"name": "python3" |
|
|
1835 |
}, |
|
|
1836 |
"language_info": { |
|
|
1837 |
"codemirror_mode": { |
|
|
1838 |
"name": "ipython", |
|
|
1839 |
"version": 3 |
|
|
1840 |
}, |
|
|
1841 |
"file_extension": ".py", |
|
|
1842 |
"mimetype": "text/x-python", |
|
|
1843 |
"name": "python", |
|
|
1844 |
"nbconvert_exporter": "python", |
|
|
1845 |
"pygments_lexer": "ipython3", |
|
|
1846 |
"version": "3.9.12" |
|
|
1847 |
}, |
|
|
1848 |
"orig_nbformat": 4 |
|
|
1849 |
}, |
|
|
1850 |
"nbformat": 4, |
|
|
1851 |
"nbformat_minor": 2 |
|
|
1852 |
} |