a b/LUNG_CANCER_logistic_regression.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "markdown",
5
   "metadata": {},
6
   "source": [
7
    "Lung_cancer: Logistic regression\n",
8
    "---\n",
9
    "---"
10
   ]
11
  },
12
  {
13
   "cell_type": "markdown",
14
   "metadata": {},
15
   "source": [
16
    "<a id='imporp'></a>\n",
17
    "## Importing Packages\n",
18
    "---"
19
   ]
20
  },
21
  {
22
   "cell_type": "code",
23
   "execution_count": 2,
24
   "metadata": {},
25
   "outputs": [],
26
   "source": [
27
    "## Basic packages\n",
28
    "import numpy as np\n",
29
    "import pandas as pd\n",
30
    "\n",
31
    "\n",
32
    "## Graphing packages\n",
33
    "import seaborn as sns\n",
34
    "import matplotlib.pyplot as plt\n",
35
    "plt.style.use('fivethirtyeight')\n",
36
    "\n",
37
    "## Scikit learn and Statsmodel packages\n",
38
    "from sklearn.linear_model import LogisticRegression, LinearRegression\n",
39
    "import statsmodels.api as sm\n",
40
    "from sklearn.metrics import confusion_matrix\n",
41
    "## Operating system dependent functionality\n",
42
    "import os\n",
43
    "import statsmodels.api as st \n",
44
    "#from pandas.stats.api import ols\n",
45
    "## Magic that renders plots inline in the notebook; the commented-out watermark lines below report package versions\n",
46
    "%matplotlib inline\n",
47
    "#%load_ext watermark\n",
48
    "#%config InlineBackend.figure_format = 'retina'\n",
49
    "#%watermark -v -d -a 'Delta Analytics' -p scikit-learn,matplotlib,numpy,pandas"
50
   ]
51
  },
52
  {
53
   "cell_type": "markdown",
54
   "metadata": {},
55
   "source": [
56
    "<a id='rds'></a>\n",
57
    "## Reading the dataset\n",
58
    "---\n",
59
    "We are using the Lung_Cancer dataset, loaded from `cancer_patient.csv`."
60
   ]
61
  },
62
  {
63
   "cell_type": "code",
64
   "execution_count": 3,
65
   "metadata": {},
66
   "outputs": [
67
    {
68
     "data": {
69
      "text/html": [
70
       "<div>\n",
71
       "<style scoped>\n",
72
       "    .dataframe tbody tr th:only-of-type {\n",
73
       "        vertical-align: middle;\n",
74
       "    }\n",
75
       "\n",
76
       "    .dataframe tbody tr th {\n",
77
       "        vertical-align: top;\n",
78
       "    }\n",
79
       "\n",
80
       "    .dataframe thead th {\n",
81
       "        text-align: right;\n",
82
       "    }\n",
83
       "</style>\n",
84
       "<table border=\"1\" class=\"dataframe\">\n",
85
       "  <thead>\n",
86
       "    <tr style=\"text-align: right;\">\n",
87
       "      <th></th>\n",
88
       "      <th>patient_id</th>\n",
89
       "      <th>age</th>\n",
90
       "      <th>gender</th>\n",
91
       "      <th>air_pollution</th>\n",
92
       "      <th>alcohol_use</th>\n",
93
       "      <th>dust_allergy</th>\n",
94
       "      <th>occupational_hazards</th>\n",
95
       "      <th>genetic_risk</th>\n",
96
       "      <th>chronic_lung_disease</th>\n",
97
       "      <th>balanced_diet</th>\n",
98
       "      <th>...</th>\n",
99
       "      <th>fatigue</th>\n",
100
       "      <th>weight_loss</th>\n",
101
       "      <th>shortness_of_breath</th>\n",
102
       "      <th>wheezing</th>\n",
103
       "      <th>swallowing_difficulty</th>\n",
104
       "      <th>clubbing_of_finger_nails</th>\n",
105
       "      <th>frequent_cold</th>\n",
106
       "      <th>dry_cough</th>\n",
107
       "      <th>snoring</th>\n",
108
       "      <th>level</th>\n",
109
       "    </tr>\n",
110
       "  </thead>\n",
111
       "  <tbody>\n",
112
       "    <tr>\n",
113
       "      <th>0</th>\n",
114
       "      <td>P1</td>\n",
115
       "      <td>33</td>\n",
116
       "      <td>1</td>\n",
117
       "      <td>2</td>\n",
118
       "      <td>4</td>\n",
119
       "      <td>5</td>\n",
120
       "      <td>4</td>\n",
121
       "      <td>3</td>\n",
122
       "      <td>2</td>\n",
123
       "      <td>2</td>\n",
124
       "      <td>...</td>\n",
125
       "      <td>3</td>\n",
126
       "      <td>4</td>\n",
127
       "      <td>2</td>\n",
128
       "      <td>2</td>\n",
129
       "      <td>3</td>\n",
130
       "      <td>1</td>\n",
131
       "      <td>2</td>\n",
132
       "      <td>3</td>\n",
133
       "      <td>4</td>\n",
134
       "      <td>Low</td>\n",
135
       "    </tr>\n",
136
       "    <tr>\n",
137
       "      <th>1</th>\n",
138
       "      <td>P10</td>\n",
139
       "      <td>17</td>\n",
140
       "      <td>1</td>\n",
141
       "      <td>3</td>\n",
142
       "      <td>1</td>\n",
143
       "      <td>5</td>\n",
144
       "      <td>3</td>\n",
145
       "      <td>4</td>\n",
146
       "      <td>2</td>\n",
147
       "      <td>2</td>\n",
148
       "      <td>...</td>\n",
149
       "      <td>1</td>\n",
150
       "      <td>3</td>\n",
151
       "      <td>7</td>\n",
152
       "      <td>8</td>\n",
153
       "      <td>6</td>\n",
154
       "      <td>2</td>\n",
155
       "      <td>1</td>\n",
156
       "      <td>7</td>\n",
157
       "      <td>2</td>\n",
158
       "      <td>Medium</td>\n",
159
       "    </tr>\n",
160
       "    <tr>\n",
161
       "      <th>2</th>\n",
162
       "      <td>P107</td>\n",
163
       "      <td>44</td>\n",
164
       "      <td>1</td>\n",
165
       "      <td>6</td>\n",
166
       "      <td>7</td>\n",
167
       "      <td>7</td>\n",
168
       "      <td>7</td>\n",
169
       "      <td>7</td>\n",
170
       "      <td>6</td>\n",
171
       "      <td>7</td>\n",
172
       "      <td>...</td>\n",
173
       "      <td>5</td>\n",
174
       "      <td>3</td>\n",
175
       "      <td>2</td>\n",
176
       "      <td>7</td>\n",
177
       "      <td>8</td>\n",
178
       "      <td>2</td>\n",
179
       "      <td>4</td>\n",
180
       "      <td>5</td>\n",
181
       "      <td>3</td>\n",
182
       "      <td>High</td>\n",
183
       "    </tr>\n",
184
       "  </tbody>\n",
185
       "</table>\n",
186
       "<p>3 rows × 25 columns</p>\n",
187
       "</div>"
188
      ],
189
      "text/plain": [
190
       "  patient_id  age  gender  air_pollution  alcohol_use  dust_allergy  \\\n",
191
       "0         P1   33       1              2            4             5   \n",
192
       "1        P10   17       1              3            1             5   \n",
193
       "2       P107   44       1              6            7             7   \n",
194
       "\n",
195
       "   occupational_hazards  genetic_risk  chronic_lung_disease  balanced_diet  \\\n",
196
       "0                     4             3                     2              2   \n",
197
       "1                     3             4                     2              2   \n",
198
       "2                     7             7                     6              7   \n",
199
       "\n",
200
       "    ...    fatigue  weight_loss  shortness_of_breath  wheezing  \\\n",
201
       "0   ...          3            4                    2         2   \n",
202
       "1   ...          1            3                    7         8   \n",
203
       "2   ...          5            3                    2         7   \n",
204
       "\n",
205
       "   swallowing_difficulty  clubbing_of_finger_nails  frequent_cold  dry_cough  \\\n",
206
       "0                      3                         1              2          3   \n",
207
       "1                      6                         2              1          7   \n",
208
       "2                      8                         2              4          5   \n",
209
       "\n",
210
       "   snoring   level  \n",
211
       "0        4     Low  \n",
212
       "1        2  Medium  \n",
213
       "2        3    High  \n",
214
       "\n",
215
       "[3 rows x 25 columns]"
216
      ]
217
     },
218
     "execution_count": 3,
219
     "metadata": {},
220
     "output_type": "execute_result"
221
    }
222
   ],
223
   "source": [
224
    "LUNG_CANCER_filepath = os.path.join('cancer_patient.csv')\n",
225
    "LUNG_CANCER = pd.read_csv(LUNG_CANCER_filepath)\n",
226
    "LUNG_CANCER.head(3)"
227
   ]
228
  },
229
  {
230
   "cell_type": "code",
231
   "execution_count": 44,
232
   "metadata": {},
233
   "outputs": [
234
    {
235
     "data": {
236
      "text/html": [
237
       "<div>\n",
238
       "<style scoped>\n",
239
       "    .dataframe tbody tr th:only-of-type {\n",
240
       "        vertical-align: middle;\n",
241
       "    }\n",
242
       "\n",
243
       "    .dataframe tbody tr th {\n",
244
       "        vertical-align: top;\n",
245
       "    }\n",
246
       "\n",
247
       "    .dataframe thead th {\n",
248
       "        text-align: right;\n",
249
       "    }\n",
250
       "</style>\n",
251
       "<table border=\"1\" class=\"dataframe\">\n",
252
       "  <thead>\n",
253
       "    <tr style=\"text-align: right;\">\n",
254
       "      <th></th>\n",
255
       "      <th>patient_id</th>\n",
256
       "      <th>age</th>\n",
257
       "      <th>gender</th>\n",
258
       "      <th>air_pollution</th>\n",
259
       "      <th>alcohol_use</th>\n",
260
       "      <th>dust_allergy</th>\n",
261
       "      <th>occupational_hazards</th>\n",
262
       "      <th>genetic_risk</th>\n",
263
       "      <th>chronic_lung_disease</th>\n",
264
       "      <th>balanced_diet</th>\n",
265
       "      <th>...</th>\n",
266
       "      <th>fatigue</th>\n",
267
       "      <th>weight_loss</th>\n",
268
       "      <th>shortness_of_breath</th>\n",
269
       "      <th>wheezing</th>\n",
270
       "      <th>swallowing_difficulty</th>\n",
271
       "      <th>clubbing_of_finger_nails</th>\n",
272
       "      <th>frequent_cold</th>\n",
273
       "      <th>dry_cough</th>\n",
274
       "      <th>snoring</th>\n",
275
       "      <th>level</th>\n",
276
       "    </tr>\n",
277
       "  </thead>\n",
278
       "  <tbody>\n",
279
       "    <tr>\n",
280
       "      <th>997</th>\n",
281
       "      <td>P997</td>\n",
282
       "      <td>25</td>\n",
283
       "      <td>2</td>\n",
284
       "      <td>4</td>\n",
285
       "      <td>5</td>\n",
286
       "      <td>6</td>\n",
287
       "      <td>5</td>\n",
288
       "      <td>5</td>\n",
289
       "      <td>4</td>\n",
290
       "      <td>6</td>\n",
291
       "      <td>...</td>\n",
292
       "      <td>8</td>\n",
293
       "      <td>7</td>\n",
294
       "      <td>9</td>\n",
295
       "      <td>2</td>\n",
296
       "      <td>1</td>\n",
297
       "      <td>4</td>\n",
298
       "      <td>6</td>\n",
299
       "      <td>7</td>\n",
300
       "      <td>2</td>\n",
301
       "      <td>High</td>\n",
302
       "    </tr>\n",
303
       "    <tr>\n",
304
       "      <th>998</th>\n",
305
       "      <td>P998</td>\n",
306
       "      <td>18</td>\n",
307
       "      <td>2</td>\n",
308
       "      <td>6</td>\n",
309
       "      <td>8</td>\n",
310
       "      <td>7</td>\n",
311
       "      <td>7</td>\n",
312
       "      <td>7</td>\n",
313
       "      <td>6</td>\n",
314
       "      <td>7</td>\n",
315
       "      <td>...</td>\n",
316
       "      <td>3</td>\n",
317
       "      <td>2</td>\n",
318
       "      <td>4</td>\n",
319
       "      <td>1</td>\n",
320
       "      <td>4</td>\n",
321
       "      <td>2</td>\n",
322
       "      <td>4</td>\n",
323
       "      <td>2</td>\n",
324
       "      <td>3</td>\n",
325
       "      <td>High</td>\n",
326
       "    </tr>\n",
327
       "    <tr>\n",
328
       "      <th>999</th>\n",
329
       "      <td>P999</td>\n",
330
       "      <td>47</td>\n",
331
       "      <td>1</td>\n",
332
       "      <td>6</td>\n",
333
       "      <td>5</td>\n",
334
       "      <td>6</td>\n",
335
       "      <td>5</td>\n",
336
       "      <td>5</td>\n",
337
       "      <td>4</td>\n",
338
       "      <td>6</td>\n",
339
       "      <td>...</td>\n",
340
       "      <td>8</td>\n",
341
       "      <td>7</td>\n",
342
       "      <td>9</td>\n",
343
       "      <td>2</td>\n",
344
       "      <td>1</td>\n",
345
       "      <td>4</td>\n",
346
       "      <td>6</td>\n",
347
       "      <td>7</td>\n",
348
       "      <td>2</td>\n",
349
       "      <td>High</td>\n",
350
       "    </tr>\n",
351
       "  </tbody>\n",
352
       "</table>\n",
353
       "<p>3 rows × 25 columns</p>\n",
354
       "</div>"
355
      ],
356
      "text/plain": [
357
       "    patient_id  age  gender  air_pollution  alcohol_use  dust_allergy  \\\n",
358
       "997       P997   25       2              4            5             6   \n",
359
       "998       P998   18       2              6            8             7   \n",
360
       "999       P999   47       1              6            5             6   \n",
361
       "\n",
362
       "     occupational_hazards  genetic_risk  chronic_lung_disease  balanced_diet  \\\n",
363
       "997                     5             5                     4              6   \n",
364
       "998                     7             7                     6              7   \n",
365
       "999                     5             5                     4              6   \n",
366
       "\n",
367
       "     ...    fatigue  weight_loss  shortness_of_breath  wheezing  \\\n",
368
       "997  ...          8            7                    9         2   \n",
369
       "998  ...          3            2                    4         1   \n",
370
       "999  ...          8            7                    9         2   \n",
371
       "\n",
372
       "     swallowing_difficulty  clubbing_of_finger_nails  frequent_cold  \\\n",
373
       "997                      1                         4              6   \n",
374
       "998                      4                         2              4   \n",
375
       "999                      1                         4              6   \n",
376
       "\n",
377
       "     dry_cough  snoring  level  \n",
378
       "997          7        2   High  \n",
379
       "998          2        3   High  \n",
380
       "999          7        2   High  \n",
381
       "\n",
382
       "[3 rows x 25 columns]"
383
      ]
384
     },
385
     "execution_count": 44,
386
     "metadata": {},
387
     "output_type": "execute_result"
388
    }
389
   ],
390
   "source": [
391
    "LUNG_CANCER.tail(3)"
392
   ]
393
  },
394
  {
395
   "cell_type": "markdown",
396
   "metadata": {},
397
   "source": [
398
    "<a id='msvl'></a>\n",
399
    "### Missing Values\n",
400
    "---\n",
401
    "1. We will drop any rows that contain missing values, if there are any."
402
   ]
403
  },
404
  {
405
   "cell_type": "code",
406
   "execution_count": 45,
407
   "metadata": {},
408
   "outputs": [
409
    {
410
     "data": {
411
      "text/plain": [
412
       "patient_id                  0\n",
413
       "age                         0\n",
414
       "gender                      0\n",
415
       "air_pollution               0\n",
416
       "alcohol_use                 0\n",
417
       "dust_allergy                0\n",
418
       "occupational_hazards        0\n",
419
       "genetic_risk                0\n",
420
       "chronic_lung_disease        0\n",
421
       "balanced_diet               0\n",
422
       "obesity                     0\n",
423
       "smoking                     0\n",
424
       "passive_smoker              0\n",
425
       "chest_pain                  0\n",
426
       "coughing_of_blood           0\n",
427
       "fatigue                     0\n",
428
       "weight_loss                 0\n",
429
       "shortness_of_breath         0\n",
430
       "wheezing                    0\n",
431
       "swallowing_difficulty       0\n",
432
       "clubbing_of_finger_nails    0\n",
433
       "frequent_cold               0\n",
434
       "dry_cough                   0\n",
435
       "snoring                     0\n",
436
       "level                       0\n",
437
       "dtype: int64"
438
      ]
439
     },
440
     "execution_count": 45,
441
     "metadata": {},
442
     "output_type": "execute_result"
443
    }
444
   ],
445
   "source": [
446
    "LUNG_CANCER.isnull().sum()"
447
   ]
448
  },
449
  {
450
   "cell_type": "code",
451
   "execution_count": 5,
452
   "metadata": {},
453
   "outputs": [],
454
   "source": [
455
    "LUNG_CANCER.dropna(inplace=True)"
456
   ]
457
  },
458
  {
459
   "cell_type": "code",
460
   "execution_count": 6,
461
   "metadata": {},
462
   "outputs": [
463
    {
464
     "data": {
465
      "text/plain": [
466
       "patient_id                  0\n",
467
       "age                         0\n",
468
       "gender                      0\n",
469
       "air_pollution               0\n",
470
       "alcohol_use                 0\n",
471
       "dust_allergy                0\n",
472
       "occupational_hazards        0\n",
473
       "genetic_risk                0\n",
474
       "chronic_lung_disease        0\n",
475
       "balanced_diet               0\n",
476
       "obesity                     0\n",
477
       "smoking                     0\n",
478
       "passive_smoker              0\n",
479
       "chest_pain                  0\n",
480
       "coughing_of_blood           0\n",
481
       "fatigue                     0\n",
482
       "weight_loss                 0\n",
483
       "shortness_of_breath         0\n",
484
       "wheezing                    0\n",
485
       "swallowing_difficulty       0\n",
486
       "clubbing_of_finger_nails    0\n",
487
       "frequent_cold               0\n",
488
       "dry_cough                   0\n",
489
       "snoring                     0\n",
490
       "level                       0\n",
491
       "dtype: int64"
492
      ]
493
     },
494
     "execution_count": 6,
495
     "metadata": {},
496
     "output_type": "execute_result"
497
    }
498
   ],
499
   "source": [
500
    "LUNG_CANCER.isnull().sum()"
501
   ]
502
  },
503
  {
504
   "cell_type": "markdown",
505
   "metadata": {},
506
   "source": [
507
    "<a id='implementation'></a>\n",
508
    "## Implementation of Logistic Regression\n",
509
    "---\n"
510
   ]
511
  },
512
  {
513
   "cell_type": "markdown",
514
   "metadata": {},
515
   "source": [
516
    "<a id='LEVEL'></a>\n",
517
    "### Level: Low, Medium, High\n",
518
    "---"
519
   ]
520
  },
521
  {
522
   "cell_type": "code",
523
   "execution_count": 7,
524
   "metadata": {},
525
   "outputs": [
526
    {
527
     "data": {
528
      "text/html": [
529
       "<div>\n",
530
       "<style scoped>\n",
531
       "    .dataframe tbody tr th:only-of-type {\n",
532
       "        vertical-align: middle;\n",
533
       "    }\n",
534
       "\n",
535
       "    .dataframe tbody tr th {\n",
536
       "        vertical-align: top;\n",
537
       "    }\n",
538
       "\n",
539
       "    .dataframe thead th {\n",
540
       "        text-align: right;\n",
541
       "    }\n",
542
       "</style>\n",
543
       "<table border=\"1\" class=\"dataframe\">\n",
544
       "  <thead>\n",
545
       "    <tr style=\"text-align: right;\">\n",
546
       "      <th></th>\n",
547
       "      <th>age</th>\n",
548
       "      <th>gender</th>\n",
549
       "      <th>air_pollution</th>\n",
550
       "      <th>alcohol_use</th>\n",
551
       "      <th>dust_allergy</th>\n",
552
       "      <th>occupational_hazards</th>\n",
553
       "      <th>genetic_risk</th>\n",
554
       "      <th>chronic_lung_disease</th>\n",
555
       "      <th>balanced_diet</th>\n",
556
       "      <th>obesity</th>\n",
557
       "      <th>...</th>\n",
558
       "      <th>coughing_of_blood</th>\n",
559
       "      <th>fatigue</th>\n",
560
       "      <th>weight_loss</th>\n",
561
       "      <th>shortness_of_breath</th>\n",
562
       "      <th>wheezing</th>\n",
563
       "      <th>swallowing_difficulty</th>\n",
564
       "      <th>clubbing_of_finger_nails</th>\n",
565
       "      <th>frequent_cold</th>\n",
566
       "      <th>dry_cough</th>\n",
567
       "      <th>snoring</th>\n",
568
       "    </tr>\n",
569
       "  </thead>\n",
570
       "  <tbody>\n",
571
       "    <tr>\n",
572
       "      <th>count</th>\n",
573
       "      <td>1000.000000</td>\n",
574
       "      <td>1000.000000</td>\n",
575
       "      <td>1000.0000</td>\n",
576
       "      <td>1000.000000</td>\n",
577
       "      <td>1000.000000</td>\n",
578
       "      <td>1000.000000</td>\n",
579
       "      <td>1000.000000</td>\n",
580
       "      <td>1000.000000</td>\n",
581
       "      <td>1000.000000</td>\n",
582
       "      <td>1000.000000</td>\n",
583
       "      <td>...</td>\n",
584
       "      <td>1000.000000</td>\n",
585
       "      <td>1000.000000</td>\n",
586
       "      <td>1000.000000</td>\n",
587
       "      <td>1000.000000</td>\n",
588
       "      <td>1000.000000</td>\n",
589
       "      <td>1000.000000</td>\n",
590
       "      <td>1000.000000</td>\n",
591
       "      <td>1000.000000</td>\n",
592
       "      <td>1000.000000</td>\n",
593
       "      <td>1000.000000</td>\n",
594
       "    </tr>\n",
595
       "    <tr>\n",
596
       "      <th>mean</th>\n",
597
       "      <td>37.174000</td>\n",
598
       "      <td>1.402000</td>\n",
599
       "      <td>3.8400</td>\n",
600
       "      <td>4.563000</td>\n",
601
       "      <td>5.165000</td>\n",
602
       "      <td>4.840000</td>\n",
603
       "      <td>4.580000</td>\n",
604
       "      <td>4.380000</td>\n",
605
       "      <td>4.491000</td>\n",
606
       "      <td>4.465000</td>\n",
607
       "      <td>...</td>\n",
608
       "      <td>4.859000</td>\n",
609
       "      <td>3.856000</td>\n",
610
       "      <td>3.855000</td>\n",
611
       "      <td>4.240000</td>\n",
612
       "      <td>3.777000</td>\n",
613
       "      <td>3.746000</td>\n",
614
       "      <td>3.923000</td>\n",
615
       "      <td>3.536000</td>\n",
616
       "      <td>3.853000</td>\n",
617
       "      <td>2.926000</td>\n",
618
       "    </tr>\n",
619
       "    <tr>\n",
620
       "      <th>std</th>\n",
621
       "      <td>12.005493</td>\n",
622
       "      <td>0.490547</td>\n",
623
       "      <td>2.0304</td>\n",
624
       "      <td>2.620477</td>\n",
625
       "      <td>1.980833</td>\n",
626
       "      <td>2.107805</td>\n",
627
       "      <td>2.126999</td>\n",
628
       "      <td>1.848518</td>\n",
629
       "      <td>2.135528</td>\n",
630
       "      <td>2.124921</td>\n",
631
       "      <td>...</td>\n",
632
       "      <td>2.427965</td>\n",
633
       "      <td>2.244616</td>\n",
634
       "      <td>2.206546</td>\n",
635
       "      <td>2.285087</td>\n",
636
       "      <td>2.041921</td>\n",
637
       "      <td>2.270383</td>\n",
638
       "      <td>2.388048</td>\n",
639
       "      <td>1.832502</td>\n",
640
       "      <td>2.039007</td>\n",
641
       "      <td>1.474686</td>\n",
642
       "    </tr>\n",
643
       "    <tr>\n",
644
       "      <th>min</th>\n",
645
       "      <td>14.000000</td>\n",
646
       "      <td>1.000000</td>\n",
647
       "      <td>1.0000</td>\n",
648
       "      <td>1.000000</td>\n",
649
       "      <td>1.000000</td>\n",
650
       "      <td>1.000000</td>\n",
651
       "      <td>1.000000</td>\n",
652
       "      <td>1.000000</td>\n",
653
       "      <td>1.000000</td>\n",
654
       "      <td>1.000000</td>\n",
655
       "      <td>...</td>\n",
656
       "      <td>1.000000</td>\n",
657
       "      <td>1.000000</td>\n",
658
       "      <td>1.000000</td>\n",
659
       "      <td>1.000000</td>\n",
660
       "      <td>1.000000</td>\n",
661
       "      <td>1.000000</td>\n",
662
       "      <td>1.000000</td>\n",
663
       "      <td>1.000000</td>\n",
664
       "      <td>1.000000</td>\n",
665
       "      <td>1.000000</td>\n",
666
       "    </tr>\n",
667
       "    <tr>\n",
668
       "      <th>25%</th>\n",
669
       "      <td>27.750000</td>\n",
670
       "      <td>1.000000</td>\n",
671
       "      <td>2.0000</td>\n",
672
       "      <td>2.000000</td>\n",
673
       "      <td>4.000000</td>\n",
674
       "      <td>3.000000</td>\n",
675
       "      <td>2.000000</td>\n",
676
       "      <td>3.000000</td>\n",
677
       "      <td>2.000000</td>\n",
678
       "      <td>3.000000</td>\n",
679
       "      <td>...</td>\n",
680
       "      <td>3.000000</td>\n",
681
       "      <td>2.000000</td>\n",
682
       "      <td>2.000000</td>\n",
683
       "      <td>2.000000</td>\n",
684
       "      <td>2.000000</td>\n",
685
       "      <td>2.000000</td>\n",
686
       "      <td>2.000000</td>\n",
687
       "      <td>2.000000</td>\n",
688
       "      <td>2.000000</td>\n",
689
       "      <td>2.000000</td>\n",
690
       "    </tr>\n",
691
       "    <tr>\n",
692
       "      <th>50%</th>\n",
693
       "      <td>36.000000</td>\n",
694
       "      <td>1.000000</td>\n",
695
       "      <td>3.0000</td>\n",
696
       "      <td>5.000000</td>\n",
697
       "      <td>6.000000</td>\n",
698
       "      <td>5.000000</td>\n",
699
       "      <td>5.000000</td>\n",
700
       "      <td>4.000000</td>\n",
701
       "      <td>4.000000</td>\n",
702
       "      <td>4.000000</td>\n",
703
       "      <td>...</td>\n",
704
       "      <td>4.000000</td>\n",
705
       "      <td>3.000000</td>\n",
706
       "      <td>3.000000</td>\n",
707
       "      <td>4.000000</td>\n",
708
       "      <td>4.000000</td>\n",
709
       "      <td>4.000000</td>\n",
710
       "      <td>4.000000</td>\n",
711
       "      <td>3.000000</td>\n",
712
       "      <td>4.000000</td>\n",
713
       "      <td>3.000000</td>\n",
714
       "    </tr>\n",
715
       "    <tr>\n",
716
       "      <th>75%</th>\n",
717
       "      <td>45.000000</td>\n",
718
       "      <td>2.000000</td>\n",
719
       "      <td>6.0000</td>\n",
720
       "      <td>7.000000</td>\n",
721
       "      <td>7.000000</td>\n",
722
       "      <td>7.000000</td>\n",
723
       "      <td>7.000000</td>\n",
724
       "      <td>6.000000</td>\n",
725
       "      <td>7.000000</td>\n",
726
       "      <td>7.000000</td>\n",
727
       "      <td>...</td>\n",
728
       "      <td>7.000000</td>\n",
729
       "      <td>5.000000</td>\n",
730
       "      <td>6.000000</td>\n",
731
       "      <td>6.000000</td>\n",
732
       "      <td>5.000000</td>\n",
733
       "      <td>5.000000</td>\n",
734
       "      <td>5.000000</td>\n",
735
       "      <td>5.000000</td>\n",
736
       "      <td>6.000000</td>\n",
737
       "      <td>4.000000</td>\n",
738
       "    </tr>\n",
739
       "    <tr>\n",
740
       "      <th>max</th>\n",
741
       "      <td>73.000000</td>\n",
742
       "      <td>2.000000</td>\n",
743
       "      <td>8.0000</td>\n",
744
       "      <td>8.000000</td>\n",
745
       "      <td>8.000000</td>\n",
746
       "      <td>8.000000</td>\n",
747
       "      <td>7.000000</td>\n",
748
       "      <td>7.000000</td>\n",
749
       "      <td>7.000000</td>\n",
750
       "      <td>7.000000</td>\n",
751
       "      <td>...</td>\n",
752
       "      <td>9.000000</td>\n",
753
       "      <td>9.000000</td>\n",
754
       "      <td>8.000000</td>\n",
755
       "      <td>9.000000</td>\n",
756
       "      <td>8.000000</td>\n",
757
       "      <td>8.000000</td>\n",
758
       "      <td>9.000000</td>\n",
759
       "      <td>7.000000</td>\n",
760
       "      <td>7.000000</td>\n",
761
       "      <td>7.000000</td>\n",
762
       "    </tr>\n",
763
       "  </tbody>\n",
764
       "</table>\n",
765
       "<p>8 rows × 23 columns</p>\n",
766
       "</div>"
767
      ],
768
      "text/plain": [
769
       "               age       gender  air_pollution  alcohol_use  dust_allergy  \\\n",
770
       "count  1000.000000  1000.000000      1000.0000  1000.000000   1000.000000   \n",
771
       "mean     37.174000     1.402000         3.8400     4.563000      5.165000   \n",
772
       "std      12.005493     0.490547         2.0304     2.620477      1.980833   \n",
773
       "min      14.000000     1.000000         1.0000     1.000000      1.000000   \n",
774
       "25%      27.750000     1.000000         2.0000     2.000000      4.000000   \n",
775
       "50%      36.000000     1.000000         3.0000     5.000000      6.000000   \n",
776
       "75%      45.000000     2.000000         6.0000     7.000000      7.000000   \n",
777
       "max      73.000000     2.000000         8.0000     8.000000      8.000000   \n",
778
       "\n",
779
       "       occupational_hazards  genetic_risk  chronic_lung_disease  \\\n",
780
       "count           1000.000000   1000.000000           1000.000000   \n",
781
       "mean               4.840000      4.580000              4.380000   \n",
782
       "std                2.107805      2.126999              1.848518   \n",
783
       "min                1.000000      1.000000              1.000000   \n",
784
       "25%                3.000000      2.000000              3.000000   \n",
785
       "50%                5.000000      5.000000              4.000000   \n",
786
       "75%                7.000000      7.000000              6.000000   \n",
787
       "max                8.000000      7.000000              7.000000   \n",
788
       "\n",
789
       "       balanced_diet      obesity     ...       coughing_of_blood  \\\n",
790
       "count    1000.000000  1000.000000     ...             1000.000000   \n",
791
       "mean        4.491000     4.465000     ...                4.859000   \n",
792
       "std         2.135528     2.124921     ...                2.427965   \n",
793
       "min         1.000000     1.000000     ...                1.000000   \n",
794
       "25%         2.000000     3.000000     ...                3.000000   \n",
795
       "50%         4.000000     4.000000     ...                4.000000   \n",
796
       "75%         7.000000     7.000000     ...                7.000000   \n",
797
       "max         7.000000     7.000000     ...                9.000000   \n",
798
       "\n",
799
       "           fatigue  weight_loss  shortness_of_breath     wheezing  \\\n",
800
       "count  1000.000000  1000.000000          1000.000000  1000.000000   \n",
801
       "mean      3.856000     3.855000             4.240000     3.777000   \n",
802
       "std       2.244616     2.206546             2.285087     2.041921   \n",
803
       "min       1.000000     1.000000             1.000000     1.000000   \n",
804
       "25%       2.000000     2.000000             2.000000     2.000000   \n",
805
       "50%       3.000000     3.000000             4.000000     4.000000   \n",
806
       "75%       5.000000     6.000000             6.000000     5.000000   \n",
807
       "max       9.000000     8.000000             9.000000     8.000000   \n",
808
       "\n",
809
       "       swallowing_difficulty  clubbing_of_finger_nails  frequent_cold  \\\n",
810
       "count            1000.000000               1000.000000    1000.000000   \n",
811
       "mean                3.746000                  3.923000       3.536000   \n",
812
       "std                 2.270383                  2.388048       1.832502   \n",
813
       "min                 1.000000                  1.000000       1.000000   \n",
814
       "25%                 2.000000                  2.000000       2.000000   \n",
815
       "50%                 4.000000                  4.000000       3.000000   \n",
816
       "75%                 5.000000                  5.000000       5.000000   \n",
817
       "max                 8.000000                  9.000000       7.000000   \n",
818
       "\n",
819
       "         dry_cough      snoring  \n",
820
       "count  1000.000000  1000.000000  \n",
821
       "mean      3.853000     2.926000  \n",
822
       "std       2.039007     1.474686  \n",
823
       "min       1.000000     1.000000  \n",
824
       "25%       2.000000     2.000000  \n",
825
       "50%       4.000000     3.000000  \n",
826
       "75%       6.000000     4.000000  \n",
827
       "max       7.000000     7.000000  \n",
828
       "\n",
829
       "[8 rows x 23 columns]"
830
      ]
831
     },
832
     "execution_count": 7,
833
     "metadata": {},
834
     "output_type": "execute_result"
835
    }
836
   ],
837
   "source": [
838
    "## Describe our dataset\n",
839
    "LUNG_CANCER.describe()"
840
   ]
841
  },
842
  {
843
   "cell_type": "code",
844
   "execution_count": 8,
845
   "metadata": {},
846
   "outputs": [
847
    {
848
     "name": "stdout",
849
     "output_type": "stream",
850
     "text": [
851
      "<class 'pandas.core.frame.DataFrame'>\n",
852
      "Int64Index: 1000 entries, 0 to 999\n",
853
      "Data columns (total 25 columns):\n",
854
      "patient_id                  1000 non-null object\n",
855
      "age                         1000 non-null int64\n",
856
      "gender                      1000 non-null int64\n",
857
      "air_pollution               1000 non-null int64\n",
858
      "alcohol_use                 1000 non-null int64\n",
859
      "dust_allergy                1000 non-null int64\n",
860
      "occupational_hazards        1000 non-null int64\n",
861
      "genetic_risk                1000 non-null int64\n",
862
      "chronic_lung_disease        1000 non-null int64\n",
863
      "balanced_diet               1000 non-null int64\n",
864
      "obesity                     1000 non-null int64\n",
865
      "smoking                     1000 non-null int64\n",
866
      "passive_smoker              1000 non-null int64\n",
867
      "chest_pain                  1000 non-null int64\n",
868
      "coughing_of_blood           1000 non-null int64\n",
869
      "fatigue                     1000 non-null int64\n",
870
      "weight_loss                 1000 non-null int64\n",
871
      "shortness_of_breath         1000 non-null int64\n",
872
      "wheezing                    1000 non-null int64\n",
873
      "swallowing_difficulty       1000 non-null int64\n",
874
      "clubbing_of_finger_nails    1000 non-null int64\n",
875
      "frequent_cold               1000 non-null int64\n",
876
      "dry_cough                   1000 non-null int64\n",
877
      "snoring                     1000 non-null int64\n",
878
      "level                       1000 non-null object\n",
879
      "dtypes: int64(23), object(2)\n",
880
      "memory usage: 203.1+ KB\n"
881
     ]
882
    }
883
   ],
884
   "source": [
885
    "LUNG_CANCER.info()"
886
   ]
887
  },
888
  {
889
   "cell_type": "markdown",
890
   "metadata": {},
891
   "source": [
892
    "### here we have a categorical Column in our dataset which is Level"
893
   ]
894
  },
895
  {
896
   "cell_type": "code",
897
   "execution_count": 9,
898
   "metadata": {},
899
   "outputs": [
900
    {
901
     "data": {
902
      "text/html": [
903
       "<div>\n",
904
       "<style scoped>\n",
905
       "    .dataframe tbody tr th:only-of-type {\n",
906
       "        vertical-align: middle;\n",
907
       "    }\n",
908
       "\n",
909
       "    .dataframe tbody tr th {\n",
910
       "        vertical-align: top;\n",
911
       "    }\n",
912
       "\n",
913
       "    .dataframe thead th {\n",
914
       "        text-align: right;\n",
915
       "    }\n",
916
       "</style>\n",
917
       "<table border=\"1\" class=\"dataframe\">\n",
918
       "  <thead>\n",
919
       "    <tr style=\"text-align: right;\">\n",
920
       "      <th></th>\n",
921
       "      <th>patient_id</th>\n",
922
       "      <th>age</th>\n",
923
       "      <th>gender</th>\n",
924
       "      <th>air_pollution</th>\n",
925
       "      <th>alcohol_use</th>\n",
926
       "      <th>dust_allergy</th>\n",
927
       "      <th>occupational_hazards</th>\n",
928
       "      <th>genetic_risk</th>\n",
929
       "      <th>chronic_lung_disease</th>\n",
930
       "      <th>balanced_diet</th>\n",
931
       "      <th>...</th>\n",
932
       "      <th>fatigue</th>\n",
933
       "      <th>weight_loss</th>\n",
934
       "      <th>shortness_of_breath</th>\n",
935
       "      <th>wheezing</th>\n",
936
       "      <th>swallowing_difficulty</th>\n",
937
       "      <th>clubbing_of_finger_nails</th>\n",
938
       "      <th>frequent_cold</th>\n",
939
       "      <th>dry_cough</th>\n",
940
       "      <th>snoring</th>\n",
941
       "      <th>level</th>\n",
942
       "    </tr>\n",
943
       "  </thead>\n",
944
       "  <tbody>\n",
945
       "    <tr>\n",
946
       "      <th>0</th>\n",
947
       "      <td>P1</td>\n",
948
       "      <td>33</td>\n",
949
       "      <td>1</td>\n",
950
       "      <td>2</td>\n",
951
       "      <td>4</td>\n",
952
       "      <td>5</td>\n",
953
       "      <td>4</td>\n",
954
       "      <td>3</td>\n",
955
       "      <td>2</td>\n",
956
       "      <td>2</td>\n",
957
       "      <td>...</td>\n",
958
       "      <td>3</td>\n",
959
       "      <td>4</td>\n",
960
       "      <td>2</td>\n",
961
       "      <td>2</td>\n",
962
       "      <td>3</td>\n",
963
       "      <td>1</td>\n",
964
       "      <td>2</td>\n",
965
       "      <td>3</td>\n",
966
       "      <td>4</td>\n",
967
       "      <td>1</td>\n",
968
       "    </tr>\n",
969
       "    <tr>\n",
970
       "      <th>1</th>\n",
971
       "      <td>P10</td>\n",
972
       "      <td>17</td>\n",
973
       "      <td>1</td>\n",
974
       "      <td>3</td>\n",
975
       "      <td>1</td>\n",
976
       "      <td>5</td>\n",
977
       "      <td>3</td>\n",
978
       "      <td>4</td>\n",
979
       "      <td>2</td>\n",
980
       "      <td>2</td>\n",
981
       "      <td>...</td>\n",
982
       "      <td>1</td>\n",
983
       "      <td>3</td>\n",
984
       "      <td>7</td>\n",
985
       "      <td>8</td>\n",
986
       "      <td>6</td>\n",
987
       "      <td>2</td>\n",
988
       "      <td>1</td>\n",
989
       "      <td>7</td>\n",
990
       "      <td>2</td>\n",
991
       "      <td>2</td>\n",
992
       "    </tr>\n",
993
       "    <tr>\n",
994
       "      <th>2</th>\n",
995
       "      <td>P107</td>\n",
996
       "      <td>44</td>\n",
997
       "      <td>1</td>\n",
998
       "      <td>6</td>\n",
999
       "      <td>7</td>\n",
1000
       "      <td>7</td>\n",
1001
       "      <td>7</td>\n",
1002
       "      <td>7</td>\n",
1003
       "      <td>6</td>\n",
1004
       "      <td>7</td>\n",
1005
       "      <td>...</td>\n",
1006
       "      <td>5</td>\n",
1007
       "      <td>3</td>\n",
1008
       "      <td>2</td>\n",
1009
       "      <td>7</td>\n",
1010
       "      <td>8</td>\n",
1011
       "      <td>2</td>\n",
1012
       "      <td>4</td>\n",
1013
       "      <td>5</td>\n",
1014
       "      <td>3</td>\n",
1015
       "      <td>3</td>\n",
1016
       "    </tr>\n",
1017
       "    <tr>\n",
1018
       "      <th>3</th>\n",
1019
       "      <td>P189</td>\n",
1020
       "      <td>39</td>\n",
1021
       "      <td>2</td>\n",
1022
       "      <td>6</td>\n",
1023
       "      <td>8</td>\n",
1024
       "      <td>7</td>\n",
1025
       "      <td>7</td>\n",
1026
       "      <td>7</td>\n",
1027
       "      <td>6</td>\n",
1028
       "      <td>7</td>\n",
1029
       "      <td>...</td>\n",
1030
       "      <td>3</td>\n",
1031
       "      <td>2</td>\n",
1032
       "      <td>4</td>\n",
1033
       "      <td>1</td>\n",
1034
       "      <td>4</td>\n",
1035
       "      <td>2</td>\n",
1036
       "      <td>4</td>\n",
1037
       "      <td>2</td>\n",
1038
       "      <td>3</td>\n",
1039
       "      <td>3</td>\n",
1040
       "    </tr>\n",
1041
       "  </tbody>\n",
1042
       "</table>\n",
1043
       "<p>4 rows × 25 columns</p>\n",
1044
       "</div>"
1045
      ],
1046
      "text/plain": [
1047
       "  patient_id  age  gender  air_pollution  alcohol_use  dust_allergy  \\\n",
1048
       "0         P1   33       1              2            4             5   \n",
1049
       "1        P10   17       1              3            1             5   \n",
1050
       "2       P107   44       1              6            7             7   \n",
1051
       "3       P189   39       2              6            8             7   \n",
1052
       "\n",
1053
       "   occupational_hazards  genetic_risk  chronic_lung_disease  balanced_diet  \\\n",
1054
       "0                     4             3                     2              2   \n",
1055
       "1                     3             4                     2              2   \n",
1056
       "2                     7             7                     6              7   \n",
1057
       "3                     7             7                     6              7   \n",
1058
       "\n",
1059
       "   ...    fatigue  weight_loss  shortness_of_breath  wheezing  \\\n",
1060
       "0  ...          3            4                    2         2   \n",
1061
       "1  ...          1            3                    7         8   \n",
1062
       "2  ...          5            3                    2         7   \n",
1063
       "3  ...          3            2                    4         1   \n",
1064
       "\n",
1065
       "   swallowing_difficulty  clubbing_of_finger_nails  frequent_cold  dry_cough  \\\n",
1066
       "0                      3                         1              2          3   \n",
1067
       "1                      6                         2              1          7   \n",
1068
       "2                      8                         2              4          5   \n",
1069
       "3                      4                         2              4          2   \n",
1070
       "\n",
1071
       "   snoring  level  \n",
1072
       "0        4      1  \n",
1073
       "1        2      2  \n",
1074
       "2        3      3  \n",
1075
       "3        3      3  \n",
1076
       "\n",
1077
       "[4 rows x 25 columns]"
1078
      ]
1079
     },
1080
     "execution_count": 9,
1081
     "metadata": {},
1082
     "output_type": "execute_result"
1083
    }
1084
   ],
1085
   "source": [
1086
    "def data_cleaning(data):\n",
1087
    "    LUNG_CANCER[\"age\"]=data[\"age\"].fillna(LUNG_CANCER[\"age\"].median())\n",
1088
    "    \n",
1089
    "    \n",
1090
    "    LUNG_CANCER.loc[data[\"level\"]==\"Low\",\"level\"]=1\n",
1091
    "    LUNG_CANCER.loc[data[\"level\"]==\"Medium\",\"level\"]=2\n",
1092
    "    LUNG_CANCER.loc[data[\"level\"]==\"High\",\"level\"]=3\n",
1093
    "    \n",
1094
    "    return data\n",
1095
    "\n",
1096
    "LUNG_CANCER=data_cleaning(LUNG_CANCER)\n",
1097
    "LUNG_CANCER.head(4)"
1098
   ]
1099
  },
1100
  {
1101
   "cell_type": "code",
1102
   "execution_count": 10,
1103
   "metadata": {},
1104
   "outputs": [
1105
    {
1106
     "data": {
1107
      "text/html": [
1108
       "<div>\n",
1109
       "<style scoped>\n",
1110
       "    .dataframe tbody tr th:only-of-type {\n",
1111
       "        vertical-align: middle;\n",
1112
       "    }\n",
1113
       "\n",
1114
       "    .dataframe tbody tr th {\n",
1115
       "        vertical-align: top;\n",
1116
       "    }\n",
1117
       "\n",
1118
       "    .dataframe thead th {\n",
1119
       "        text-align: right;\n",
1120
       "    }\n",
1121
       "</style>\n",
1122
       "<table border=\"1\" class=\"dataframe\">\n",
1123
       "  <thead>\n",
1124
       "    <tr style=\"text-align: right;\">\n",
1125
       "      <th></th>\n",
1126
       "      <th>patient_id</th>\n",
1127
       "      <th>age</th>\n",
1128
       "      <th>gender</th>\n",
1129
       "      <th>air_pollution</th>\n",
1130
       "      <th>alcohol_use</th>\n",
1131
       "      <th>dust_allergy</th>\n",
1132
       "      <th>occupational_hazards</th>\n",
1133
       "      <th>genetic_risk</th>\n",
1134
       "      <th>chronic_lung_disease</th>\n",
1135
       "      <th>balanced_diet</th>\n",
1136
       "      <th>...</th>\n",
1137
       "      <th>weight_loss</th>\n",
1138
       "      <th>shortness_of_breath</th>\n",
1139
       "      <th>wheezing</th>\n",
1140
       "      <th>swallowing_difficulty</th>\n",
1141
       "      <th>clubbing_of_finger_nails</th>\n",
1142
       "      <th>frequent_cold</th>\n",
1143
       "      <th>dry_cough</th>\n",
1144
       "      <th>snoring</th>\n",
1145
       "      <th>level</th>\n",
1146
       "      <th>intercept</th>\n",
1147
       "    </tr>\n",
1148
       "  </thead>\n",
1149
       "  <tbody>\n",
1150
       "    <tr>\n",
1151
       "      <th>0</th>\n",
1152
       "      <td>P1</td>\n",
1153
       "      <td>33</td>\n",
1154
       "      <td>1</td>\n",
1155
       "      <td>2</td>\n",
1156
       "      <td>4</td>\n",
1157
       "      <td>5</td>\n",
1158
       "      <td>4</td>\n",
1159
       "      <td>3</td>\n",
1160
       "      <td>2</td>\n",
1161
       "      <td>2</td>\n",
1162
       "      <td>...</td>\n",
1163
       "      <td>4</td>\n",
1164
       "      <td>2</td>\n",
1165
       "      <td>2</td>\n",
1166
       "      <td>3</td>\n",
1167
       "      <td>1</td>\n",
1168
       "      <td>2</td>\n",
1169
       "      <td>3</td>\n",
1170
       "      <td>4</td>\n",
1171
       "      <td>1</td>\n",
1172
       "      <td>1.0</td>\n",
1173
       "    </tr>\n",
1174
       "    <tr>\n",
1175
       "      <th>1</th>\n",
1176
       "      <td>P10</td>\n",
1177
       "      <td>17</td>\n",
1178
       "      <td>1</td>\n",
1179
       "      <td>3</td>\n",
1180
       "      <td>1</td>\n",
1181
       "      <td>5</td>\n",
1182
       "      <td>3</td>\n",
1183
       "      <td>4</td>\n",
1184
       "      <td>2</td>\n",
1185
       "      <td>2</td>\n",
1186
       "      <td>...</td>\n",
1187
       "      <td>3</td>\n",
1188
       "      <td>7</td>\n",
1189
       "      <td>8</td>\n",
1190
       "      <td>6</td>\n",
1191
       "      <td>2</td>\n",
1192
       "      <td>1</td>\n",
1193
       "      <td>7</td>\n",
1194
       "      <td>2</td>\n",
1195
       "      <td>2</td>\n",
1196
       "      <td>1.0</td>\n",
1197
       "    </tr>\n",
1198
       "    <tr>\n",
1199
       "      <th>2</th>\n",
1200
       "      <td>P107</td>\n",
1201
       "      <td>44</td>\n",
1202
       "      <td>1</td>\n",
1203
       "      <td>6</td>\n",
1204
       "      <td>7</td>\n",
1205
       "      <td>7</td>\n",
1206
       "      <td>7</td>\n",
1207
       "      <td>7</td>\n",
1208
       "      <td>6</td>\n",
1209
       "      <td>7</td>\n",
1210
       "      <td>...</td>\n",
1211
       "      <td>3</td>\n",
1212
       "      <td>2</td>\n",
1213
       "      <td>7</td>\n",
1214
       "      <td>8</td>\n",
1215
       "      <td>2</td>\n",
1216
       "      <td>4</td>\n",
1217
       "      <td>5</td>\n",
1218
       "      <td>3</td>\n",
1219
       "      <td>3</td>\n",
1220
       "      <td>1.0</td>\n",
1221
       "    </tr>\n",
1222
       "    <tr>\n",
1223
       "      <th>3</th>\n",
1224
       "      <td>P189</td>\n",
1225
       "      <td>39</td>\n",
1226
       "      <td>2</td>\n",
1227
       "      <td>6</td>\n",
1228
       "      <td>8</td>\n",
1229
       "      <td>7</td>\n",
1230
       "      <td>7</td>\n",
1231
       "      <td>7</td>\n",
1232
       "      <td>6</td>\n",
1233
       "      <td>7</td>\n",
1234
       "      <td>...</td>\n",
1235
       "      <td>2</td>\n",
1236
       "      <td>4</td>\n",
1237
       "      <td>1</td>\n",
1238
       "      <td>4</td>\n",
1239
       "      <td>2</td>\n",
1240
       "      <td>4</td>\n",
1241
       "      <td>2</td>\n",
1242
       "      <td>3</td>\n",
1243
       "      <td>3</td>\n",
1244
       "      <td>1.0</td>\n",
1245
       "    </tr>\n",
1246
       "  </tbody>\n",
1247
       "</table>\n",
1248
       "<p>4 rows × 26 columns</p>\n",
1249
       "</div>"
1250
      ],
1251
      "text/plain": [
1252
       "  patient_id  age  gender  air_pollution  alcohol_use  dust_allergy  \\\n",
1253
       "0         P1   33       1              2            4             5   \n",
1254
       "1        P10   17       1              3            1             5   \n",
1255
       "2       P107   44       1              6            7             7   \n",
1256
       "3       P189   39       2              6            8             7   \n",
1257
       "\n",
1258
       "   occupational_hazards  genetic_risk  chronic_lung_disease  balanced_diet  \\\n",
1259
       "0                     4             3                     2              2   \n",
1260
       "1                     3             4                     2              2   \n",
1261
       "2                     7             7                     6              7   \n",
1262
       "3                     7             7                     6              7   \n",
1263
       "\n",
1264
       "     ...      weight_loss  shortness_of_breath  wheezing  \\\n",
1265
       "0    ...                4                    2         2   \n",
1266
       "1    ...                3                    7         8   \n",
1267
       "2    ...                3                    2         7   \n",
1268
       "3    ...                2                    4         1   \n",
1269
       "\n",
1270
       "   swallowing_difficulty  clubbing_of_finger_nails  frequent_cold  dry_cough  \\\n",
1271
       "0                      3                         1              2          3   \n",
1272
       "1                      6                         2              1          7   \n",
1273
       "2                      8                         2              4          5   \n",
1274
       "3                      4                         2              4          2   \n",
1275
       "\n",
1276
       "   snoring  level  intercept  \n",
1277
       "0        4      1        1.0  \n",
1278
       "1        2      2        1.0  \n",
1279
       "2        3      3        1.0  \n",
1280
       "3        3      3        1.0  \n",
1281
       "\n",
1282
       "[4 rows x 26 columns]"
1283
      ]
1284
     },
1285
     "execution_count": 10,
1286
     "metadata": {},
1287
     "output_type": "execute_result"
1288
    }
1289
   ],
1290
   "source": [
1291
    "## now lets add the intercept\n",
1292
    "\n",
1293
    "LUNG_CANCER['intercept'] = 1.0\n",
1294
    "\n",
1295
    "## we have a dataset that is ready for analysis\n",
1296
    "LUNG_CANCER.head(4)"
1297
   ]
1298
  },
1299
  {
1300
   "cell_type": "code",
1301
   "execution_count": 11,
1302
   "metadata": {},
1303
   "outputs": [
1304
    {
1305
     "name": "stdout",
1306
     "output_type": "stream",
1307
     "text": [
1308
      "(1000, 17) (1000,)\n",
1309
      "(700, 17) (700,)\n",
1310
      "(300, 17) (300,)\n"
1311
     ]
1312
    },
1313
    {
1314
     "name": "stderr",
1315
     "output_type": "stream",
1316
     "text": [
1317
      "C:\\Users\\Lina\\Anaconda3\\lib\\site-packages\\sklearn\\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
1318
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
1319
     ]
1320
    }
1321
   ],
1322
   "source": [
1323
    "'''Define y and X'''\n",
1324
    "y = LUNG_CANCER['level'] \n",
1325
    "columns_ = LUNG_CANCER.columns.tolist()\n",
1326
    "exclude_col = ['level','patient_id','alcohol_use','dust_allergy','occupational_hazards','balanced_diet','obesity','snoring','frequent_cold']\n",
1327
    "X = LUNG_CANCER[[i for i in columns_ if i not in exclude_col]]\n",
1328
    "X = st.add_constant(X, prepend = False) \n",
1329
    "print (X.shape, y.shape)\n",
1330
    "\n",
1331
    "'''Split the data'''\n",
1332
    "from sklearn.cross_validation import train_test_split\n",
1333
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)\n",
1334
    "\n",
1335
    "print (X_train.shape, y_train.shape)\n",
1336
    "print (X_test.shape, y_test.shape)"
1337
   ]
1338
  },
1339
  {
1340
   "cell_type": "code",
1341
   "execution_count": 12,
1342
   "metadata": {},
1343
   "outputs": [
1344
    {
1345
     "data": {
1346
      "text/plain": [
1347
       "Index(['patient_id', 'age', 'gender', 'air_pollution', 'alcohol_use',\n",
1348
       "       'dust_allergy', 'occupational_hazards', 'genetic_risk',\n",
1349
       "       'chronic_lung_disease', 'balanced_diet', 'obesity', 'smoking',\n",
1350
       "       'passive_smoker', 'chest_pain', 'coughing_of_blood', 'fatigue',\n",
1351
       "       'weight_loss', 'shortness_of_breath', 'wheezing',\n",
1352
       "       'swallowing_difficulty', 'clubbing_of_finger_nails', 'frequent_cold',\n",
1353
       "       'dry_cough', 'snoring', 'level', 'intercept'],\n",
1354
       "      dtype='object')"
1355
      ]
1356
     },
1357
     "execution_count": 12,
1358
     "metadata": {},
1359
     "output_type": "execute_result"
1360
    }
1361
   ],
1362
   "source": [
1363
    "LUNG_CANCER.columns"
1364
   ]
1365
  },
1366
  {
1367
   "cell_type": "code",
1368
   "execution_count": 13,
1369
   "metadata": {},
1370
   "outputs": [
1371
    {
1372
     "data": {
1373
      "text/plain": [
1374
       "array([[-3.17685199e-02,  1.47464418e-01,  1.91315555e-01,\n",
1375
       "        -1.07116609e+00,  3.29164839e-02,  6.86628736e-02,\n",
1376
       "        -1.37516216e+00,  7.06189546e-01, -1.28525724e+00,\n",
1377
       "        -1.53738658e+00,  1.03157677e-01, -3.54326295e-01,\n",
1378
       "        -9.39638735e-01, -7.18157437e-01, -1.24079736e+00,\n",
1379
       "        -4.89756753e-01,  2.93152389e-06],\n",
1380
       "       [ 3.66812895e-02, -2.78637530e-01, -5.08674811e-01,\n",
1381
       "         6.64688514e-01, -4.37367788e-01, -2.12402217e-01,\n",
1382
       "         1.47896467e-01, -7.19789255e-01,  2.09123966e-02,\n",
1383
       "         3.45001892e-01, -5.25706903e-01, -2.19421558e-01,\n",
1384
       "         7.03518991e-01,  1.15699818e-02,  6.00934480e-01,\n",
1385
       "         2.57988306e-02,  1.51172633e-08],\n",
1386
       "       [-4.91276964e-03,  1.31173112e-01,  3.17359256e-01,\n",
1387
       "         4.06477580e-01,  4.04451304e-01,  1.43739344e-01,\n",
1388
       "         1.22726570e+00,  1.35997095e-02,  1.26434485e+00,\n",
1389
       "         1.19238469e+00,  4.22549226e-01,  5.73747853e-01,\n",
1390
       "         2.36119744e-01,  7.06587455e-01,  6.39862879e-01,\n",
1391
       "         4.63957923e-01, -2.94664117e-06]])"
1392
      ]
1393
     },
1394
     "execution_count": 13,
1395
     "metadata": {},
1396
     "output_type": "execute_result"
1397
    }
1398
   ],
1399
   "source": [
1400
    "## Set up the regression\n",
1401
    "\n",
1402
    "mul_lr = LogisticRegression(multi_class='multinomial',solver ='newton-cg').fit(X_train,y_train)\n",
1403
    "\n",
1404
    "## lets get the results\n",
1405
    "mul_lr.intercept_\n",
1406
    "mul_lr.coef_"
1407
   ]
1408
  },
1409
  {
1410
   "cell_type": "code",
1411
   "execution_count": 14,
1412
   "metadata": {},
1413
   "outputs": [
1414
    {
1415
     "name": "stdout",
1416
     "output_type": "stream",
1417
     "text": [
1418
      "Warning: Maximum number of iterations has been exceeded.\n",
1419
      "         Current function value: 0.097334\n",
1420
      "         Iterations: 35\n",
1421
      "         Function evaluations: 38\n",
1422
      "         Gradient evaluations: 38\n",
1423
      "                          MNLogit Regression Results                          \n",
1424
      "==============================================================================\n",
1425
      "Dep. Variable:                  level   No. Observations:                  700\n",
1426
      "Model:                        MNLogit   Df Residuals:                      666\n",
1427
      "Method:                           MLE   Df Model:                           32\n",
1428
      "Date:                Tue, 04 Dec 2018   Pseudo R-squ.:                  0.9112\n",
1429
      "Time:                        20:33:12   Log-Likelihood:                -68.134\n",
1430
      "converged:                      False   LL-Null:                       -767.63\n",
1431
      "                                        LLR p-value:                5.963e-274\n",
1432
      "============================================================================================\n",
1433
      "                 level=2       coef    std err          z      P>|z|      [0.025      0.975]\n",
1434
      "--------------------------------------------------------------------------------------------\n",
1435
      "age                         -0.1031      0.060     -1.728      0.084      -0.220       0.014\n",
1436
      "gender                     -10.9784      3.356     -3.271      0.001     -17.556      -4.400\n",
1437
      "air_pollution               -2.6913      1.131     -2.379      0.017      -4.908      -0.474\n",
1438
      "genetic_risk                 5.0468      1.404      3.594      0.000       2.295       7.799\n",
1439
      "chronic_lung_disease        -2.7845      0.910     -3.059      0.002      -4.569      -1.000\n",
1440
      "smoking                     -0.9366      0.569     -1.647      0.099      -2.051       0.178\n",
1441
      "passive_smoker               1.4782      0.672      2.200      0.028       0.162       2.795\n",
1442
      "chest_pain                  -0.8705      0.682     -1.276      0.202      -2.208       0.467\n",
1443
      "coughing_of_blood            0.0447      0.545      0.082      0.935      -1.024       1.113\n",
1444
      "fatigue                      5.8949      1.561      3.775      0.000       2.835       8.955\n",
1445
      "weight_loss                 -1.6336      0.832     -1.964      0.050      -3.264      -0.003\n",
1446
      "shortness_of_breath         -0.0884      0.551     -0.160      0.873      -1.169       0.992\n",
1447
      "wheezing                     0.6777      0.877      0.773      0.440      -1.041       2.397\n",
1448
      "swallowing_difficulty        0.5567      0.668      0.834      0.405      -0.752       1.866\n",
1449
      "clubbing_of_finger_nails     3.7198      1.152      3.229      0.001       1.462       5.978\n",
1450
      "dry_cough                   -0.6692      0.579     -1.156      0.248      -1.804       0.465\n",
1451
      "intercept                   -5.1110      2.439     -2.096      0.036      -9.891      -0.331\n",
1452
      "--------------------------------------------------------------------------------------------\n",
1453
      "                 level=3       coef    std err          z      P>|z|      [0.025      0.975]\n",
1454
      "--------------------------------------------------------------------------------------------\n",
1455
      "age                         -1.0151      0.427     -2.379      0.017      -1.851      -0.179\n",
1456
      "gender                     -12.5765      3.659     -3.437      0.001     -19.749      -5.404\n",
1457
      "air_pollution                1.4011      1.765      0.794      0.427      -2.059       4.861\n",
1458
      "genetic_risk                 4.5602      4.878      0.935      0.350      -5.000      14.121\n",
1459
      "chronic_lung_disease         0.0066      3.619      0.002      0.999      -7.087       7.100\n",
1460
      "smoking                     -3.5495      1.631     -2.177      0.030      -6.746      -0.353\n",
1461
      "passive_smoker              11.2581      3.068      3.669      0.000       5.245      17.271\n",
1462
      "chest_pain                  -4.3841      3.281     -1.336      0.182     -10.815       2.047\n",
1463
      "coughing_of_blood            1.3247      3.298      0.402      0.688      -5.140       7.789\n",
1464
      "fatigue                      7.5920      1.937      3.919      0.000       3.795      11.389\n",
1465
      "weight_loss                 -1.4729      2.535     -0.581      0.561      -6.441       3.495\n",
1466
      "shortness_of_breath          3.0151      1.000      3.016      0.003       1.056       4.975\n",
1467
      "wheezing                    -3.7620      1.886     -1.994      0.046      -7.459      -0.065\n",
1468
      "swallowing_difficulty        0.1687      0.817      0.206      0.836      -1.433       1.771\n",
1469
      "clubbing_of_finger_nails     0.6814      2.566      0.266      0.791      -4.348       5.710\n",
1470
      "dry_cough                   -1.6649      1.083     -1.537      0.124      -3.788       0.458\n",
1471
      "intercept                  -10.5903      3.496     -3.029      0.002     -17.442      -3.738\n",
1472
      "============================================================================================\n"
1473
     ]
1474
    },
1475
    {
1476
     "name": "stderr",
1477
     "output_type": "stream",
1478
     "text": [
1479
      "C:\\Users\\Lina\\Anaconda3\\lib\\site-packages\\statsmodels\\base\\model.py:508: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
1480
      "  \"Check mle_retvals\", ConvergenceWarning)\n"
1481
     ]
1482
    }
1483
   ],
1484
   "source": [
1485
    "## Set up the regression\n",
1486
    "\n",
1487
    "logit = sm.MNLogit(y_train, X_train)\n",
1488
    "logit_result = logit.fit(method='bfgs')\n",
1489
    "\n",
1490
    "## lets get the results\n",
1491
    "print(logit_result.summary())"
1492
   ]
1493
  },
1494
  {
1495
   "cell_type": "code",
1496
   "execution_count": 15,
1497
   "metadata": {},
1498
   "outputs": [
1499
    {
1500
     "name": "stdout",
1501
     "output_type": "stream",
1502
     "text": [
1503
      "Coeffieients\n",
1504
      "                                  0          1\n",
1505
      "age                       -0.103143  -1.015068\n",
1506
      "gender                   -10.978373 -12.576457\n",
1507
      "air_pollution             -2.691276   1.401134\n",
1508
      "genetic_risk               5.046776   4.560153\n",
1509
      "chronic_lung_disease      -2.784542   0.006603\n",
1510
      "smoking                   -0.936573  -3.549460\n",
1511
      "passive_smoker             1.478245  11.258127\n",
1512
      "chest_pain                -0.870475  -4.384082\n",
1513
      "coughing_of_blood          0.044676   1.324661\n",
1514
      "fatigue                    5.894866   7.592025\n",
1515
      "weight_loss               -1.633568  -1.472910\n",
1516
      "shortness_of_breath       -0.088386   3.015135\n",
1517
      "wheezing                   0.677735  -3.762030\n",
1518
      "swallowing_difficulty      0.556677   0.168725\n",
1519
      "clubbing_of_finger_nails   3.719845   0.681413\n",
1520
      "dry_cough                 -0.669196  -1.664935\n",
1521
      "intercept                 -5.111022 -10.590282\n",
1522
      "\n",
1523
      "\n",
1524
      "p-Values\n",
1525
      "                                 0         1\n",
1526
      "age                       0.083975  0.017339\n",
1527
      "gender                    0.001071  0.000589\n",
1528
      "air_pollution             0.017339  0.427364\n",
1529
      "genetic_risk              0.000325  0.349867\n",
1530
      "chronic_lung_disease      0.002224  0.998544\n",
1531
      "smoking                   0.099470  0.029504\n",
1532
      "passive_smoker            0.027775  0.000243\n",
1533
      "chest_pain                0.201996  0.181521\n",
1534
      "coughing_of_blood         0.934670  0.687977\n",
1535
      "fatigue                   0.000160  0.000089\n",
1536
      "weight_loss               0.049567  0.561152\n",
1537
      "shortness_of_breath       0.872586  0.002563\n",
1538
      "wheezing                  0.439681  0.046131\n",
1539
      "swallowing_difficulty     0.404531  0.836471\n",
1540
      "clubbing_of_finger_nails  0.001242  0.790572\n",
1541
      "dry_cough                 0.247558  0.124328\n",
1542
      "intercept                 0.036120  0.002451\n",
1543
      "\n",
1544
      "\n",
1545
      "Dependent variables\n",
1546
      "level\n"
1547
     ]
1548
    }
1549
   ],
1550
   "source": [
1551
    "print(\"Coeffieients\")\n",
1552
    "print(logit_result.params)\n",
1553
    "print (\"\\n\")\n",
1554
    "print(\"p-Values\")\n",
1555
    "print(logit_result.pvalues)\n",
1556
    "print (\"\\n\")\n",
1557
    "print(\"Dependent variables\")\n",
1558
    "print(logit.endog_names)"
1559
   ]
1560
  },
1561
  {
1562
   "cell_type": "markdown",
1563
   "metadata": {},
1564
   "source": [
1565
    "## Interpreting logistic regression coefficients.\n",
1566
    "In this case, using the odds ratio (the exponentiated coefficient) will help us understand how a 1-unit increase or decrease in any of the variables affects the odds of a patient being in a given lung cancer level."
1567
   ]
1568
  },
1569
  {
1570
   "cell_type": "code",
1571
   "execution_count": 57,
1572
   "metadata": {},
1573
   "outputs": [
1574
    {
1575
     "name": "stdout",
1576
     "output_type": "stream",
1577
     "text": [
1578
      "                                   0             1\n",
1579
      "age                         0.901998      0.362378\n",
1580
      "gender                      0.000017      0.000003\n",
1581
      "air_pollution               0.067794      4.059801\n",
1582
      "genetic_risk              155.520200     95.598101\n",
1583
      "chronic_lung_disease        0.061757      1.006625\n",
1584
      "smoking                     0.391969      0.028740\n",
1585
      "passive_smoker              4.385243  77507.296804\n",
1586
      "chest_pain                  0.418752      0.012474\n",
1587
      "coughing_of_blood           1.045689      3.760909\n",
1588
      "fatigue                   363.168194   1982.323098\n",
1589
      "weight_loss                 0.195232      0.229257\n",
1590
      "shortness_of_breath         0.915408     20.391840\n",
1591
      "wheezing                    1.969412      0.023237\n",
1592
      "swallowing_difficulty       1.744865      1.183795\n",
1593
      "clubbing_of_finger_nails   41.257983      1.976668\n",
1594
      "dry_cough                   0.512120      0.189203\n",
1595
      "intercept                   0.006030      0.000025\n"
1596
     ]
1597
    }
1598
   ],
1599
   "source": [
1600
    "print (np.exp(logit_result.params))"
1601
   ]
1602
  },
1603
  {
1604
   "cell_type": "markdown",
1605
   "metadata": {},
1606
   "source": [
1607
    "These values are from our training set; now let's predict on our test set."
1608
   ]
1609
  },
1610
  {
1611
   "cell_type": "markdown",
1612
   "metadata": {},
1613
   "source": [
1614
    "<span style=\"color:red\">Please explain more about the coefficients and p-values and what they mean, e.g. which features are most important? which has a higher influence on each level based on coefficients?</span>"
1615
   ]
1616
  },
1617
  {
1618
   "cell_type": "markdown",
1619
   "metadata": {},
1620
   "source": [
1621
    "## Predicting and Evaluating\n",
1622
    "If we call the predict method, we get the predicted probabilities. To predict whether a patient's lung cancer level is Low, Medium, or High, we must convert these predicted probabilities into class labels: 0 = Low, 1 = Medium, 2 = High."
1623
   ]
1624
  },
1625
  {
1626
   "cell_type": "code",
1627
   "execution_count": 58,
1628
   "metadata": {},
1629
   "outputs": [
1630
    {
1631
     "name": "stdout",
1632
     "output_type": "stream",
1633
     "text": [
1634
      "                0             1             2\n",
1635
      "204  9.739892e-01  2.601079e-02  5.988912e-25\n",
1636
      "71   9.941086e-01  3.641338e-12  5.891434e-03\n",
1637
      "594  4.357147e-09  1.000000e+00  1.202880e-27\n",
1638
      "672  9.685383e-01  8.177787e-05  3.137988e-02\n",
1639
      "14   3.378482e-20  6.732186e-23  1.000000e+00\n",
1640
      "64   9.966876e-01  3.312377e-03  1.929552e-26\n",
1641
      "340  1.792525e-06  9.999982e-01  2.440023e-22\n",
1642
      "135  2.908870e-06  3.631127e-06  9.999935e-01\n",
1643
      "350  1.321207e-03  9.986788e-01  1.271600e-26\n",
1644
      "976  6.893179e-18  2.766205e-23  1.000000e+00\n"
1645
     ]
1646
    }
1647
   ],
1648
   "source": [
1649
    "## Here we have the predictive probabilities\n",
1650
    "predictions = logit_result.predict(X_test)\n",
1651
    "print(predictions[:10])"
1652
   ]
1653
  },
1654
  {
1655
   "cell_type": "code",
1656
   "execution_count": 60,
1657
   "metadata": {},
1658
   "outputs": [
1659
    {
1660
     "data": {
1661
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZ8AAAEJCAYAAABL3SrKAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAFmZJREFUeJzt3H9MVff9x/EXA01punKVwQVX1I2iBZ2rulwlVq2wsui+1dq0U2z3dVYrBuxsNn9gY1tJ2yG71tmBUiulmxsu04mFFu2yVpg/gLqsGKyGSpy10gJ36oiB4NDK94+G+y3lAvdeuB+49PlISDyf+/nc877v3NyX59xzbkBTU1O7AAAw6BsDXQAA4OuH8AEAGEf4AACMI3wAAMYRPgAA4wgfAIBxhA8AwDjCBwBgnF+HT21t7UCXMOjQE9foi2v0pSt64lp/98WvwwcA4J8IHwCAcYQPAMA4wgcAYFyv4bNt2zbNmTNHUVFRio6O1qJFi3T27Nlen/jMmTOaN2+eIiIiFBsbq6ysLLW38wPaAAA3wuf48eNavny5/vrXv6q4uFhBQUF66KGH9J///KfbNdeuXdPChQsVHh6uI0eOaMuWLcrOzlZOTk6/Fg8A8E9BvU0oLCzstL1r1y6NHj1alZWVmjt3rss1+/fvV2trq3JzcxUcHKy4uDidO3dOO3fu1OrVqxUQENA/1QMA/JLH3/k0Nzfr1q1bslgs3c45efKk4uPjFRwc7BxLTExUfX29Ll686F2lAIAho9cjn69KT0/X9773Pdlstm7nOBwOjRo1qtNYWFiY87GxY8e6XOfNTUwdaw79sEiXM04rfXSjrL+/rqD7C2S/eUifDDvuHHup7H90+6YQ59hnUdk69MMi59hvSi8q6P4CZWz+k27fFKIl0Qecz5Wx+U+dnr9q026PazWFm+Rcoy+u0Zeu6IlrnvYlJiam28c8Cp9nnnlGlZWVeueddxQYGNjj3K+eWuu42KCnU249FepKbW2tx2v6sr/+WutLfe3JUEVfXKMvXdET1/q7L26Hz8aNG1VYWKi33nqr2yOXDuHh4XI4HJ3GLl++LOn/j4AAAF9fbn3ns2HDBv3lL39RcXGxxo0b1+t8m82miooKXb9+3TlWWlqqyMhIjRkzxvtqAQBDQq/hs3btWu3du1d5eXmyWCxqbGxUY2OjmpubnXMyMjI0f/585/Yjjzyi4OBgpaam6uzZsyouLtb27duVmprKlW4AgN5Pu+Xl5UmSFixY0Gl8w4YN2rhxoySpoaFBFy5ccD4WEhKigwcPau3atZozZ44sFovS0tK0evXq/qwdAOCneg2fpqamXp8kNze3y9iECRN0+PBh76oCAAxp/LYbAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABjnVvicOHFCixcvVmxsrCwWiwoKCnqcf/HiRVksli5/7777br8UDQDwb0HuTGppaVFcXJySk5O1atUqt5/8wIEDmjhxonN7xIgRnlcIABhy3AqfpKQkJSUlSZJSU1PdfvKRI0fKarV6VxkAYMjy6Xc+P/3pT3X33XfrRz/6kYqKiny5KwCAH3HryMdTd9xxh1544QVNnz5dQUFBOnTokJYtW6bc3FwtWrTIF7sEAPgRn4RPaGionnrqKef25MmTdfXqVb3yyis9hk9tba3H+/JmzUCv9bXBXNtAoi+u0Zeu6IlrnvYlJiam28d8
Ej6uTJ06tder5Hoq1JXa2lqP1/Rlf/211pf62pOhir64Rl+6oieu9XdfjN3nc/r0aS4+AABIcvPIp7m5Wf/6178kSbdu3VJdXZ2qq6s1YsQIRUVFKSMjQ//85z9VXFwsSdq7d6+GDRumSZMm6Rvf+Ibeeecd5eXlafPmzT57IQAA/+FW+FRVVenBBx90bmdmZiozM1PJycnKzc1VQ0ODLly40GnN1q1bdenSJQUGBio6Olo5OTlcbAAAkORm+MycOVNNTU3dPp6bm9tpe8mSJVqyZEnfKgMADFn8thsAwDjCBwBgHOEDADCO8AEAGEf4AACMI3wAAMYRPgAA4wgfAIBxhA8AwDjCBwBgHOEDADCO8AEAGEf4AACMI3wAAMYRPgAA4wgfAIBxhA8AwDjCBwBgHOEDADCO8AEAGEf4AACMI3wAAMYRPgAA4wgfAIBxhA8AwDjCBwBgHOEDADCO8AEAGEf4AACMI3wAAMYRPgAA4wgfAIBxhA8AwDjCBwBgHOEDADCO8AEAGEf4AACMI3wAAMYRPgAA4wgfAIBxhA8AwDjCBwBgHOEDADCO8AEAGEf4AACMcyt8Tpw4ocWLFys2NlYWi0UFBQW9rjlz5ozmzZuniIgIxcbGKisrS+3t7X0uGADg/9wKn5aWFsXFxWnLli0KDg7udf61a9e0cOFChYeH68iRI9qyZYuys7OVk5PT54IBAP4vyJ1JSUlJSkpKkiSlpqb2On///v1qbW1Vbm6ugoODFRcXp3Pnzmnnzp1avXq1AgIC+lY1AMCv+eQ7n5MnTyo+Pr7TUVJiYqLq6+t18eJFX+wSAOBH3Dry8ZTD4dCoUaM6jYWFhTkfGzt2rMt1tbW1Hu/LmzUdLG98qgwv13qz30M/LNK8dxd4uUf39aUnQxl9ca23vkx+8UlVbdptqJrBYbC+Vya/+KSC7i+Q/eYhfTLsuNJHN8r6++uq2rRbZWVl+mTYcf3vjE2Svvi8uX1TiD4Zdly/Kb2ooPsLlLH5T7p9U4iWRB+Q9ffXexxz9VnlaV9iYmK6fcwn4SOpy6m1josNejrl1lOhrtTW1nq8pr94u19f1zuQPRnM6Itr7vbl69Q7f3yvxMTEqKyszPnv/nrOL+vvvvjktFt4eLgcDkenscuXL0v6/yMgAMDXl0/Cx2azqaKiQtevX3eOlZaWKjIyUmPGjPHFLgEAfsSt8GlublZ1dbWqq6t169Yt1dXVqbq6WpcuXZIkZWRkaP78+c75jzzyiIKDg5WamqqzZ8+quLhY27dvV2pqKle6AQDcC5+qqirNmjVLs2bNUmtrqzIzMzVr1iz96le/kiQ1NDTowoULzvkhISE6ePCg6uvrNWfOHK1bt05paWlavXq1b14FAMCvuHXBwcyZM9XU1NTt47m5uV3GJkyYoMOHD3tfGQBgyOK33QAAxhE+AADjCB8AgHGEDwDAOMIHAGAc4QMAMI7wAQAYR/gAAIwjfAAAxhE+AADjCB8AgHGEDwDAOMIHAGAc4QMAMI7wAQAYR/gAAIwjfAAAxhE+AADjCB8AgHGEDwDAOMIHAGAc4QMAMI7wAQAYR/gAAIwjfAAAxhE+AADjCB8AgHGEDwDAOMIHAGAc4QMAMI7wAQAYR/gAAIwjfAAAxhE+AADjCB8AgHGEDwDAOMIHAGAc4QMAMI7wAQAYR/gAAIwjfAAAxhE+AADjCB8AgHGEDwDAOLfDJy8vT5MmTZLVatXs2bNVXl7e7dxjx47JYrF0+Tt37ly/FA0A8G9B7kwqLCxUenq6Xn75ZU2fPl15eXl69NFHVVlZqaioqG7XVVZWasSIEc7tb33rW32vGADg99w68tmxY4eWLFmipUuXavz48bLb7bJarcrPz+9xXVhYmKxWq/MvMDCwX4oGAPi3XsOnra1Np06dUkJCQqfxhIQEvf/++z2uvf/++zV+/HjNnz9fR48e7VulAIAho9fTbleuXNHnn3+usLCwTuNhYWFyOBwu10RERGjb
tm2aMmWK2tra9Oc//1kLFizQ22+/rRkzZvRP5QAAv+XWdz6SFBAQ0Gm7vb29y1iHmJgYxcTEOLdtNps++eQTZWdn9xg+tbW17pbTpzX9wdv9mqh3oHoy2NEX13rry2Q35gw1g/X1Tu5m/Mv19lftrp7H0+f+cg58Va/hExoaqsDAwC5HOZcvX+5yNNSTqVOnqrCwsMc5PRXqSm1trcdr+ou3+/V1vQPZk8GMvrjmbl++Tr3zx/dKTEyMysrKnP/ur+f8sv7uS6/f+QwfPlz33nuvSktLO42XlpZq2rRpbu/o9OnTslqtnlcIABhy3DrtlpaWppSUFE2dOlXTpk1Tfn6+GhoatGzZMklSSkqKJGnXrl2SpJ07d2r06NGKjY1VW1ub9u3bp5KSEu3Zs8dHLwMA4E/cCp+HH35YV69eld1uV2Njo2JjY7Vv3z6NHj1aklRXV9dp/o0bN/Tss8+qvr5et912m3N+UlJS/78CAIDfcfuCgxUrVmjFihUuHyspKem0vWbNGq1Zs6ZvlQEAhix+2w0AYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGET4AAOMIHwCAcYQPAMA4wgcAYBzhAwAwjvABABhH+AAAjCN8AADGuR0+eXl5mjRpkqxWq2bPnq3y8vIe5x8/flyzZ8+W1WrV97//feXn5/e5WADA0OBW+BQWFio9PV2//OUvdfToUdlsNj366KO6dOmSy/kff/yxfvKTn8hms+no0aP6xS9+ofXr16uoqKhfiwcA+Ce3wmfHjh1asmSJli5dqvHjx8tut8tqtXZ7NPPGG28oIiJCdrtd48eP19KlS5WcnKycnJx+LR4A4J8Cmpqa2nua0NbWpsjISL3++ut66KGHnONr167V2bNndejQoS5r5s6dqwkTJmjr1q3OsTfffFMrVqxQfX29hg0b1o8vAQDgb3o98rly5Yo+//xzhYWFdRoPCwuTw+FwucbhcLicf/PmTV25cqUP5QIAhgK3LzgICAjotN3e3t5lrLf5rsYBAF8/vYZPaGioAgMDuxzlXL58ucvRTYfw8HCX84OCgjRy5Mg+lAsAGAp6DZ/hw4fr3nvvVWlpaafx0tJSTZs2zeUam82msrKyLvMnT57M9z0AAPdOu6WlpWnv3r3as2ePPvroI23YsEENDQ1atmyZJCklJUUpKSnO+cuWLdNnn32m9PR0ffTRR9qzZ4/27t2r1atX++ZVAAD8ilvh8/DDDyszM1N2u10zZ85UZWWl9u3bp9GjR0uS6urqVFdX55w/duxY7du3T+Xl5Zo5c6a2bt2qrKwsLViwwO3CuKnVNU/6UlxcrIULFyo6Olp33XWXEhMTXV6dOBR4+n7pUFFRodDQUMXHx/u4woHhaV/a2tr00ksvadKkSQoPD9fEiRP16quvGqrWDE97sn//ft13332KjIzUuHHjtHLlSjU2Nhqq1owTJ05o8eLFio2NlcViUUFBQa9rzpw5o3nz5ikiIkKxsbHKyspyfrfvDrcvOFixYoVOnz4th8Ohv//975oxY4bzsZKSEpWUlHSaf9999+no0aNyOByqrq7WE0884XZR3NTqmqd9OXHihGbNmqV9+/bp6NGjeuCBB/T444+7/cHsLzztS4empiatWrVKs2fPNlSpWd70Zfny5Xrvvff0yiuv6B//+Id+97vfacKECQar9i1Pe1JZWamUlBQlJyeroqJCBQUFqqmp0ZNPPmm4ct9qaWlRXFyctmzZouDg4F7nX7t2TQsXLlR4eLiOHDmiLVu2KDs726N7OXu9z2cgJCYm
asKECfrtb3/rHJsyZYoWLFig559/vsv8559/Xm+99ZY++OAD59hTTz2lmpoa/e1vfzNSswme9sWVhIQExcfH66WXXvJVmcZ525fHH39cEydOVHt7u4qLi1VRUWGiXGM87cuRI0f0s5/9TFVVVQoNDTVZqjGe9iQ7O1u7du3Shx9+6Bz74x//qA0bNujTTz81UrNp3/72t/XrX/9ajz32WLdzXn/9dW3evFnnzp1zhpXdbld+fr7Onj3r1lXNg+6HRdva2nTq1CklJCR0Gk9ISND777/vcs3Jkye7zE9MTFRVVZVu3Ljhs1pN8qYvrjQ3N8tisfR3eQPG277k5eXJ4XBo3bp1vi5xQHjTl5KSEk2ePFk7duxQXFycpkyZovXr16u5udlEyT7nTU+mTZumxsZGHT58WO3t7bpy5YoKCwv1wAMPmCh50Dp58qTi4+M7HSUlJiaqvr5eFy9edOs5Bl34cFOra9705at2796tzz77TIsWLfJFiQPCm76cOXNGWVlZeu211xQYGGiiTOO86cvHH3+syspKffjhh9qzZ4/sdrvee+89paammijZ57zpic1mU15enlauXKmwsDBFR0ervb1dubm5JkoetLr7zO14zB2DLnw6cFOra572pUNRUZGee+45vfbaa84LRYYSd/vy3//+V8uXL9cLL7ygsWPHGqpu4Hjyfrl165YCAgK0e/du/eAHP1BiYqLsdruKi4vd/kDxB570pKamRunp6Vq3bp3Kysp04MABNTY26umnnzZR6qDW18/coH6vqI+4qdU1b/rSoaioSKtWrdKrr76qefPm+bJM4zztS0NDg2pqapSWlqa0tDRJX3zotre3KzQ0VPv37+9yWsYfefN+sVqtioyMVEhIiHNs3Lhxkr64ojU8PNx3BRvgTU+2bdumKVOm6Oc//7kkaeLEibr99ts1d+5cPfvss7rrrrt8Xvdg1N1nrqReP486DLojH25qdc2bvkjSwYMHlZKSop07d3p0qbu/8LQvo0aNUnl5uY4dO+b8e+KJJ/Td735Xx44dk81mM1W6T3nzfpk+fboaGho6fcdz/vx5SVJUVJTvijXEm560trZ2OTXbse3JZcVDjc1mU0VFha5fv+4cKy0tVWRkpMaMGePWcwSmp6dv9lF9XvvmN7+pzMxMRURE6LbbbpPdbld5eblycnIUEhKilJQUvf3223rwwQclSd/5zne0fft2/fvf/1ZUVJQOHTqkl19+WS+++KLuueeeAX41/cfTvhw4cEArV65URkaGkpKS1NLSopaWFt24ccOtyyn9hSd9CQwMVFhYWKe/Dz74QOfPn9fGjRs1fPjwgX45/cbT98vdd9+tgoICnTp1Svfcc4/Onz+vdevWacaMGT1e+eRPPO1Ja2ursrOzFRoaqpEjRzpPw1mtVq1Zs2aAX03/aW5uVk1NjRobG/WHP/xBcXFxuvPOO9XW1qaQkBBlZGRo27ZtSk5OliRFR0frjTfe0OnTpxUTE6OKigo999xzevrpp3v8z/CXDbrTbtIXN7VevXpVdrtdjY2Nio2N7XJT65d13NT6zDPPKD8/XxERER7f1OoPPO1Lfn6+bt68qY0bN2rjxo3O8RkzZnS5L8ufedqXrwtP+3LHHXfozTff1Pr165WQkCCLxaIf//jHbl/G7w887cljjz2m5uZm7d69W5s2bdKdd96pmTNnKiMjYyDK95mqqipn4EpSZmamMjMzlZycrNzcXDU0NOjChQvOx0NCQnTw4EGtXbtWc+bMkcViUVpamke/YjMo7/MBAAxtg+47HwDA0Ef4AACMI3wAAMYRPgAA4wgfAIBxhA8AwDjCBwBgHOEDADCO8AEAGPd/L8rwPGVvLckAAAAASUVORK5CYII=\n",
1662
      "text/plain": [
1663
       "<Figure size 432x288 with 1 Axes>"
1664
      ]
1665
     },
1666
     "metadata": {},
1667
     "output_type": "display_data"
1668
    }
1669
   ],
1670
   "source": [
1671
    "plt.hist(predictions);"
1672
   ]
1673
  },
1674
  {
1675
   "cell_type": "markdown",
1676
   "metadata": {},
1677
   "source": [
1678
    "### Confusion matrix and Classification report\n",
1679
    "---"
1680
   ]
1681
  },
1682
  {
1683
   "cell_type": "code",
1684
   "execution_count": 61,
1685
   "metadata": {},
1686
   "outputs": [
1687
    {
1688
     "data": {
1689
      "text/plain": [
1690
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
1691
       "          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
1692
       "          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
1693
       "          verbose=0, warm_start=False)"
1694
      ]
1695
     },
1696
     "execution_count": 61,
1697
     "metadata": {},
1698
     "output_type": "execute_result"
1699
    }
1700
   ],
1701
   "source": [
1702
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
1703
    "logreg = LogisticRegression()\n",
1704
    "logreg.fit(X_train, y_train)"
1705
   ]
1706
  },
1707
  {
1708
   "cell_type": "code",
1709
   "execution_count": 62,
1710
   "metadata": {},
1711
   "outputs": [
1712
    {
1713
     "name": "stdout",
1714
     "output_type": "stream",
1715
     "text": [
1716
      "Accuracy of logistic regression classifier on test set: 0.97\n"
1717
     ]
1718
    }
1719
   ],
1720
   "source": [
1721
    "y_pred = logreg.predict(X_test)\n",
1722
    "print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))"
1723
   ]
1724
  },
1725
  {
1726
   "cell_type": "code",
1727
   "execution_count": 63,
1728
   "metadata": {},
1729
   "outputs": [
1730
    {
1731
     "name": "stdout",
1732
     "output_type": "stream",
1733
     "text": [
1734
      "length of oversampled data is  720\n",
1735
      "Number of no subscription in oversampled data 0\n",
1736
      "Number of subscription 240\n",
1737
      "Proportion of no subscription data in oversampled data is  0.0\n",
1738
      "Proportion of subscription data in oversampled data is  0.3333333333333333\n"
1739
     ]
1740
    }
1741
   ],
1742
   "source": [
1743
    "from imblearn.over_sampling import SMOTE\n",
1744
    "columns = X_train.columns\n",
1745
    "os = SMOTE(random_state=0)\n",
1746
    "os_data_X,os_data_y=os.fit_sample(X_train, y_train)\n",
1747
    "os_data_X = pd.DataFrame(data=os_data_X,columns=columns )\n",
1748
    "os_data_y= pd.DataFrame(data=os_data_y,columns=['y'])\n",
1749
    "# Check the size and per-class counts of the oversampled data (NOTE: levels are coded 1/2/3, so the y==0 count below is always 0)\n",
1750
    "print(\"length of oversampled data is \",len(os_data_X))\n",
1751
    "print(\"Number of no subscription in oversampled data\",len(os_data_y[os_data_y['y']==0]))\n",
1752
    "print(\"Number of subscription\",len(os_data_y[os_data_y['y']==1]))\n",
1753
    "print(\"Proportion of no subscription data in oversampled data is \",len(os_data_y[os_data_y['y']==0])/len(os_data_X))\n",
1754
    "print(\"Proportion of subscription data in oversampled data is \",len(os_data_y[os_data_y['y']==1])/len(os_data_X))"
1755
   ]
1756
  },
1757
  {
1758
   "cell_type": "code",
1759
   "execution_count": 64,
1760
   "metadata": {},
1761
   "outputs": [
1762
    {
1763
     "name": "stdout",
1764
     "output_type": "stream",
1765
     "text": [
1766
      "[ True  True  True  True  True  True  True  True  True  True  True  True\n",
1767
      "  True  True  True  True  True]\n",
1768
      "[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]\n"
1769
     ]
1770
    }
1771
   ],
1772
   "source": [
1773
    "from sklearn.feature_selection import RFE\n",
1774
    "logreg = LogisticRegression()\n",
1775
    "rfe = RFE(logreg, 20)\n",
1776
    "rfe = rfe.fit(os_data_X, os_data_y.values.ravel())\n",
1777
    "print(rfe.support_)\n",
1778
    "print(rfe.ranking_)"
1779
   ]
1780
  },
1781
  {
1782
   "cell_type": "code",
1783
   "execution_count": 65,
1784
   "metadata": {},
1785
   "outputs": [
1786
    {
1787
     "name": "stdout",
1788
     "output_type": "stream",
1789
     "text": [
1790
      "[[ 76   0   0]\n",
1791
      " [  4  83   5]\n",
1792
      " [  0   0 132]]\n"
1793
     ]
1794
    },
1795
    {
1796
     "data": {
1797
      "text/html": [
1798
       "<div>\n",
1799
       "<style scoped>\n",
1800
       "    .dataframe tbody tr th:only-of-type {\n",
1801
       "        vertical-align: middle;\n",
1802
       "    }\n",
1803
       "\n",
1804
       "    .dataframe tbody tr th {\n",
1805
       "        vertical-align: top;\n",
1806
       "    }\n",
1807
       "\n",
1808
       "    .dataframe thead th {\n",
1809
       "        text-align: right;\n",
1810
       "    }\n",
1811
       "</style>\n",
1812
       "<table border=\"1\" class=\"dataframe\">\n",
1813
       "  <thead>\n",
1814
       "    <tr style=\"text-align: right;\">\n",
1815
       "      <th></th>\n",
1816
       "      <th>Predict_Label_0 Low</th>\n",
1817
       "      <th>Predict_Label_1 Medium</th>\n",
1818
       "      <th>Predict_Label_2 High</th>\n",
1819
       "    </tr>\n",
1820
       "  </thead>\n",
1821
       "  <tbody>\n",
1822
       "    <tr>\n",
1823
       "      <th>True_Label_0 Low</th>\n",
1824
       "      <td>76</td>\n",
1825
       "      <td>0</td>\n",
1826
       "      <td>0</td>\n",
1827
       "    </tr>\n",
1828
       "    <tr>\n",
1829
       "      <th>True_Label_1 Medium</th>\n",
1830
       "      <td>4</td>\n",
1831
       "      <td>83</td>\n",
1832
       "      <td>5</td>\n",
1833
       "    </tr>\n",
1834
       "    <tr>\n",
1835
       "      <th>True_Label_2 High</th>\n",
1836
       "      <td>0</td>\n",
1837
       "      <td>0</td>\n",
1838
       "      <td>132</td>\n",
1839
       "    </tr>\n",
1840
       "  </tbody>\n",
1841
       "</table>\n",
1842
       "</div>"
1843
      ],
1844
      "text/plain": [
1845
       "                     Predict_Label_0 Low  Predict_Label_1 Medium  \\\n",
1846
       "True_Label_0 Low                      76                       0   \n",
1847
       "True_Label_1 Medium                    4                      83   \n",
1848
       "True_Label_2 High                      0                       0   \n",
1849
       "\n",
1850
       "                     Predict_Label_2 High  \n",
1851
       "True_Label_0 Low                        0  \n",
1852
       "True_Label_1 Medium                     5  \n",
1853
       "True_Label_2 High                     132  "
1854
      ]
1855
     },
1856
     "execution_count": 65,
1857
     "metadata": {},
1858
     "output_type": "execute_result"
1859
    }
1860
   ],
1861
   "source": [
1862
    "from sklearn.metrics import confusion_matrix\n",
1863
    "confusion_matrix = confusion_matrix(y_test, y_pred)\n",
1864
    "print(confusion_matrix)\n",
1865
    "confusion = pd.DataFrame(confusion_matrix,index=['True_Label_0 Low', 'True_Label_1 Medium','True_Label_2 High'],\n",
1866
    "                         columns=['Predict_Label_0 Low', 'Predict_Label_1 Medium','Predict_Label_2 High'])\n",
1867
    "\n",
1868
    "confusion"
1869
   ]
1870
  },
1871
  {
1872
   "cell_type": "code",
1873
   "execution_count": 66,
1874
   "metadata": {},
1875
   "outputs": [
1876
    {
1877
     "name": "stdout",
1878
     "output_type": "stream",
1879
     "text": [
1880
      "[[ 40   0   0   0   0   0   0]\n",
1881
      " [121  91   0   0   0   0   0]\n",
1882
      " [ 92  81   0   0   0   0   0]\n",
1883
      " [ 20  20   0   0   0   0   0]\n",
1884
      " [  0  20  80   0   0   0   0]\n",
1885
      " [ 20  20  68   0   0   0   0]\n",
1886
      " [ 10 100 217   0   0   0   0]]\n"
1887
     ]
1888
    },
1889
    {
1890
     "data": {
1891
      "text/html": [
1892
       "<div>\n",
1893
       "<style scoped>\n",
1894
       "    .dataframe tbody tr th:only-of-type {\n",
1895
       "        vertical-align: middle;\n",
1896
       "    }\n",
1897
       "\n",
1898
       "    .dataframe tbody tr th {\n",
1899
       "        vertical-align: top;\n",
1900
       "    }\n",
1901
       "\n",
1902
       "    .dataframe thead th {\n",
1903
       "        text-align: right;\n",
1904
       "    }\n",
1905
       "</style>\n",
1906
       "<table border=\"1\" class=\"dataframe\">\n",
1907
       "  <thead>\n",
1908
       "    <tr style=\"text-align: right;\">\n",
1909
       "      <th></th>\n",
1910
       "      <th>level Low</th>\n",
1911
       "      <th>level Medium</th>\n",
1912
       "      <th>level High</th>\n",
1913
       "      <th>Predected_Label_4</th>\n",
1914
       "      <th>Predected_Label_5</th>\n",
1915
       "      <th>Predected_Label_6</th>\n",
1916
       "      <th>Predected_Label_7</th>\n",
1917
       "    </tr>\n",
1918
       "  </thead>\n",
1919
       "  <tbody>\n",
1920
       "    <tr>\n",
1921
       "      <th>genetic_risk_level_1</th>\n",
1922
       "      <td>40</td>\n",
1923
       "      <td>0</td>\n",
1924
       "      <td>0</td>\n",
1925
       "      <td>0</td>\n",
1926
       "      <td>0</td>\n",
1927
       "      <td>0</td>\n",
1928
       "      <td>0</td>\n",
1929
       "    </tr>\n",
1930
       "    <tr>\n",
1931
       "      <th>genetic_risk_level_2</th>\n",
1932
       "      <td>121</td>\n",
1933
       "      <td>91</td>\n",
1934
       "      <td>0</td>\n",
1935
       "      <td>0</td>\n",
1936
       "      <td>0</td>\n",
1937
       "      <td>0</td>\n",
1938
       "      <td>0</td>\n",
1939
       "    </tr>\n",
1940
       "    <tr>\n",
1941
       "      <th>genetic_risk_level_3</th>\n",
1942
       "      <td>92</td>\n",
1943
       "      <td>81</td>\n",
1944
       "      <td>0</td>\n",
1945
       "      <td>0</td>\n",
1946
       "      <td>0</td>\n",
1947
       "      <td>0</td>\n",
1948
       "      <td>0</td>\n",
1949
       "    </tr>\n",
1950
       "    <tr>\n",
1951
       "      <th>genetic_risk_level_4</th>\n",
1952
       "      <td>20</td>\n",
1953
       "      <td>20</td>\n",
1954
       "      <td>0</td>\n",
1955
       "      <td>0</td>\n",
1956
       "      <td>0</td>\n",
1957
       "      <td>0</td>\n",
1958
       "      <td>0</td>\n",
1959
       "    </tr>\n",
1960
       "    <tr>\n",
1961
       "      <th>genetic_risk_level_5</th>\n",
1962
       "      <td>0</td>\n",
1963
       "      <td>20</td>\n",
1964
       "      <td>80</td>\n",
1965
       "      <td>0</td>\n",
1966
       "      <td>0</td>\n",
1967
       "      <td>0</td>\n",
1968
       "      <td>0</td>\n",
1969
       "    </tr>\n",
1970
       "    <tr>\n",
1971
       "      <th>genetic_risk_level_6</th>\n",
1972
       "      <td>20</td>\n",
1973
       "      <td>20</td>\n",
1974
       "      <td>68</td>\n",
1975
       "      <td>0</td>\n",
1976
       "      <td>0</td>\n",
1977
       "      <td>0</td>\n",
1978
       "      <td>0</td>\n",
1979
       "    </tr>\n",
1980
       "    <tr>\n",
1981
       "      <th>genetic_risk_level_7</th>\n",
1982
       "      <td>10</td>\n",
1983
       "      <td>100</td>\n",
1984
       "      <td>217</td>\n",
1985
       "      <td>0</td>\n",
1986
       "      <td>0</td>\n",
1987
       "      <td>0</td>\n",
1988
       "      <td>0</td>\n",
1989
       "    </tr>\n",
1990
       "  </tbody>\n",
1991
       "</table>\n",
1992
       "</div>"
1993
      ],
1994
      "text/plain": [
1995
       "                      level Low  level Medium  level High  Predected_Label_4  \\\n",
1996
       "genetic_risk_level_1         40             0           0                  0   \n",
1997
       "genetic_risk_level_2        121            91           0                  0   \n",
1998
       "genetic_risk_level_3         92            81           0                  0   \n",
1999
       "genetic_risk_level_4         20            20           0                  0   \n",
2000
       "genetic_risk_level_5          0            20          80                  0   \n",
2001
       "genetic_risk_level_6         20            20          68                  0   \n",
2002
       "genetic_risk_level_7         10           100         217                  0   \n",
2003
       "\n",
2004
       "                      Predected_Label_5  Predected_Label_6  Predected_Label_7  \n",
2005
       "genetic_risk_level_1                  0                  0                  0  \n",
2006
       "genetic_risk_level_2                  0                  0                  0  \n",
2007
       "genetic_risk_level_3                  0                  0                  0  \n",
2008
       "genetic_risk_level_4                  0                  0                  0  \n",
2009
       "genetic_risk_level_5                  0                  0                  0  \n",
2010
       "genetic_risk_level_6                  0                  0                  0  \n",
2011
       "genetic_risk_level_7                  0                  0                  0  "
2012
      ]
2013
     },
2014
     "execution_count": 66,
2015
     "metadata": {},
2016
     "output_type": "execute_result"
2017
    }
2018
   ],
2019
   "source": [
2020
    "from sklearn.metrics import confusion_matrix\n",
2021
    "confusion_matrix = confusion_matrix(LUNG_CANCER.genetic_risk, LUNG_CANCER.level)\n",
2022
    "print(confusion_matrix)\n",
2023
    "confusion = pd.DataFrame(confusion_matrix,index=['genetic_risk_level_1', 'genetic_risk_level_2','genetic_risk_level_3','genetic_risk_level_4', 'genetic_risk_level_5','genetic_risk_level_6','genetic_risk_level_7'],\n",
2024
    "                         columns=['level Low', 'level Medium','level High','Predected_Label_4', 'Predected_Label_5','Predected_Label_6','Predected_Label_7'])\n",
2025
    "\n",
2026
    "confusion"
2027
   ]
2028
  },
2029
  {
2030
   "cell_type": "code",
2031
   "execution_count": 32,
2032
   "metadata": {},
2033
   "outputs": [
2034
    {
2035
     "data": {
2036
      "text/plain": [
2037
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
2038
       "          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
2039
       "          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
2040
       "          verbose=0, warm_start=False)"
2041
      ]
2042
     },
2043
     "execution_count": 32,
2044
     "metadata": {},
2045
     "output_type": "execute_result"
2046
    }
2047
   ],
2048
   "source": [
2049
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)\n",
2050
    "logreg = LogisticRegression()\n",
2051
    "logreg.fit(X_train, y_train)"
2052
   ]
2053
  },
2054
  {
2055
   "cell_type": "code",
2056
   "execution_count": 33,
2057
   "metadata": {},
2058
   "outputs": [
2059
    {
2060
     "name": "stdout",
2061
     "output_type": "stream",
2062
     "text": [
2063
      "Accuracy of logistic regression classifier on test set: 0.97\n"
2064
     ]
2065
    }
2066
   ],
2067
   "source": [
2068
    "y_pred = logreg.predict(X_test)\n",
2069
    "print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))"
2070
   ]
2071
  },
2072
  {
2073
   "cell_type": "markdown",
2074
   "metadata": {},
2075
   "source": [
2076
    "## Compute precision, recall, F-measure and support"
2077
   ]
2078
  },
2079
  {
2080
   "cell_type": "code",
2081
   "execution_count": 79,
2082
   "metadata": {},
2083
   "outputs": [
2084
    {
2085
     "name": "stdout",
2086
     "output_type": "stream",
2087
     "text": [
2088
      "             precision    recall  f1-score   support\n",
2089
      "\n",
2090
      "          1      0.950     1.000     0.974        76\n",
2091
      "          2      1.000     0.902     0.949        92\n",
2092
      "          3      0.964     1.000     0.981       132\n",
2093
      "\n",
2094
      "avg / total      0.971     0.970     0.970       300\n",
2095
      "\n"
2096
     ]
2097
    }
2098
   ],
2099
   "source": [
2100
    "from sklearn.metrics import classification_report\n",
2101
    "print (classification_report(y_test, y_pred, digits=3))"
2102
   ]
2103
  },
2104
  {
2105
   "cell_type": "markdown",
2106
   "metadata": {},
2107
   "source": [
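2108
    "Interpretation: The model predicts the correct cancer level for 97% of the patients in the test set (average precision 0.971, recall 0.970, F1-score 0.970). This does not mean that 97% of patients have a High cancer level: only 132 of the 300 test patients are in the High class."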
2109
   ]
2110
  },
2111
  {
2112
   "cell_type": "markdown",
2113
   "metadata": {},
2114
   "source": [
2115
    "## Let's implement the same logistic regression using scikit-learn\n",
2116
    "\n",
2117
    "---"
2118
   ]
2119
  },
2120
  {
2121
   "cell_type": "code",
2122
   "execution_count": 81,
2123
   "metadata": {},
2124
   "outputs": [
2125
    {
2126
     "name": "stdout",
2127
     "output_type": "stream",
2128
     "text": [
2129
      "3    365\n",
2130
      "2    332\n",
2131
      "1    303\n",
2132
      "Name: level, dtype: int64 \n",
2133
      "\n"
2134
     ]
2135
    }
2136
   ],
2137
   "source": [
2138
    "'''Remember that 1 is Low, 2 is Medium, 3 is High'''\n",
2139
    "print (LUNG_CANCER['level'].value_counts(), \"\\n\" )"
2140
   ]
2141
  },
2142
  {
2143
   "cell_type": "code",
2144
   "execution_count": 82,
2145
   "metadata": {},
2146
   "outputs": [
2147
    {
2148
     "data": {
2149
      "text/plain": [
2150
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
2151
       "          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
2152
       "          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
2153
       "          verbose=0, warm_start=False)"
2154
      ]
2155
     },
2156
     "execution_count": 82,
2157
     "metadata": {},
2158
     "output_type": "execute_result"
2159
    }
2160
   ],
2161
   "source": [
2162
    "logistic = LogisticRegression()\n",
2163
    "logistic.fit(X_train, y_train)"
2164
   ]
2165
  },
2166
  {
2167
   "cell_type": "code",
2168
   "execution_count": 83,
2169
   "metadata": {},
2170
   "outputs": [
2171
    {
2172
     "name": "stdout",
2173
     "output_type": "stream",
2174
     "text": [
2175
      "[[ 76   0   0]\n",
2176
      " [  4  83   5]\n",
2177
      " [  0   0 132]]\n"
2178
     ]
2179
    },
2180
    {
2181
     "data": {
2182
      "text/html": [
2183
       "<div>\n",
2184
       "<style scoped>\n",
2185
       "    .dataframe tbody tr th:only-of-type {\n",
2186
       "        vertical-align: middle;\n",
2187
       "    }\n",
2188
       "\n",
2189
       "    .dataframe tbody tr th {\n",
2190
       "        vertical-align: top;\n",
2191
       "    }\n",
2192
       "\n",
2193
       "    .dataframe thead th {\n",
2194
       "        text-align: right;\n",
2195
       "    }\n",
2196
       "</style>\n",
2197
       "<table border=\"1\" class=\"dataframe\">\n",
2198
       "  <thead>\n",
2199
       "    <tr style=\"text-align: right;\">\n",
2200
       "      <th></th>\n",
2201
       "      <th>Predict_Label_0 Low</th>\n",
2202
       "      <th>Predict_Label_1 Medium</th>\n",
2203
       "      <th>Predict_Label_2 High</th>\n",
2204
       "    </tr>\n",
2205
       "  </thead>\n",
2206
       "  <tbody>\n",
2207
       "    <tr>\n",
2208
       "      <th>True_Label_0 Low</th>\n",
2209
       "      <td>76</td>\n",
2210
       "      <td>0</td>\n",
2211
       "      <td>0</td>\n",
2212
       "    </tr>\n",
2213
       "    <tr>\n",
2214
       "      <th>True_Label_1 Medium</th>\n",
2215
       "      <td>4</td>\n",
2216
       "      <td>83</td>\n",
2217
       "      <td>5</td>\n",
2218
       "    </tr>\n",
2219
       "    <tr>\n",
2220
       "      <th>True_Label_2 High</th>\n",
2221
       "      <td>0</td>\n",
2222
       "      <td>0</td>\n",
2223
       "      <td>132</td>\n",
2224
       "    </tr>\n",
2225
       "  </tbody>\n",
2226
       "</table>\n",
2227
       "</div>"
2228
      ],
2229
      "text/plain": [
2230
       "                     Predict_Label_0 Low  Predict_Label_1 Medium  \\\n",
2231
       "True_Label_0 Low                      76                       0   \n",
2232
       "True_Label_1 Medium                    4                      83   \n",
2233
       "True_Label_2 High                      0                       0   \n",
2234
       "\n",
2235
       "                     Predict_Label_2 High  \n",
2236
       "True_Label_0 Low                        0  \n",
2237
       "True_Label_1 Medium                     5  \n",
2238
       "True_Label_2 High                     132  "
2239
      ]
2240
     },
2241
     "execution_count": 83,
2242
     "metadata": {},
2243
     "output_type": "execute_result"
2244
    }
2245
   ],
2246
   "source": [
2247
    "from sklearn.metrics import confusion_matrix\n",
2248
    "y_pred=logistic.predict(X_test)\n",
2249
    "confusion_matrix = confusion_matrix(y_test, y_pred)\n",
2250
    "print(confusion_matrix)\n",
2251
    "confusion = pd.DataFrame(confusion_matrix,index=['True_Label_0 Low', 'True_Label_1 Medium','True_Label_2 High'],\n",
2252
    "                         columns=['Predict_Label_0 Low', 'Predict_Label_1 Medium','Predict_Label_2 High'])\n",
2253
    "\n",
2254
    "confusion"
2255
   ]
2256
  },
2257
  {
2258
   "cell_type": "code",
2259
   "execution_count": 84,
2260
   "metadata": {},
2261
   "outputs": [
2262
    {
2263
     "name": "stdout",
2264
     "output_type": "stream",
2265
     "text": [
2266
      "             precision    recall  f1-score   support\n",
2267
      "\n",
2268
      "          1      0.950     1.000     0.974        76\n",
2269
      "          2      1.000     0.902     0.949        92\n",
2270
      "          3      0.964     1.000     0.981       132\n",
2271
      "\n",
2272
      "avg / total      0.971     0.970     0.970       300\n",
2273
      "\n"
2274
     ]
2275
    }
2276
   ],
2277
   "source": [
2278
    "print (classification_report(y_test, y_pred, digits=3))"
2279
   ]
2280
  },
2281
  {
2282
   "cell_type": "markdown",
2283
   "metadata": {},
2284
   "source": [
2285
    "Interpretation: The model assigns the correct cancer level (Low, Medium or High) to 97% of the patients in the test set (291 of 300)."
2286
   ]
2287
  },
2288
  {
2289
   "cell_type": "code",
2290
   "execution_count": 86,
2291
   "metadata": {},
2292
   "outputs": [
2293
    {
2294
     "data": {
2295
      "text/plain": [
2296
       "LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
2297
       "          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,\n",
2298
       "          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,\n",
2299
       "          verbose=0, warm_start=False)"
2300
      ]
2301
     },
2302
     "execution_count": 86,
2303
     "metadata": {},
2304
     "output_type": "execute_result"
2305
    }
2306
   ],
2307
   "source": [
2308
    "'''Use scikit learn'''\n",
2309
    "r_d_logistic = LogisticRegression()\n",
2310
    "r_d_logistic.fit(X_train, y_train)"
2311
   ]
2312
  },
2313
  {
2314
   "cell_type": "markdown",
2315
   "metadata": {},
2316
   "source": [
2317
    "### Calculate Accuracy, Misclassification Rate (Error Rate), Precision, and Recall\n",
2318
    "---"
2319
   ]
2320
  },
2321
  {
2322
   "cell_type": "code",
2323
   "execution_count": 87,
2324
   "metadata": {},
2325
   "outputs": [
2326
    {
2327
     "name": "stdout",
2328
     "output_type": "stream",
2329
     "text": [
2330
      "Accuracy score: 97.000\n"
2331
     ]
2332
    }
2333
   ],
2334
   "source": [
2335
    "## Accuracy\n",
2336
    "## How often is the classifier correct?\n",
2337
    "from sklearn.metrics import accuracy_score\n",
2338
    "\n",
2339
    "acc = accuracy_score(y_test, y_pred)\n",
2340
    "print (\"Accuracy score: %.3f\" %(acc*100))"
2341
   ]
2342
  },
2343
  {
2344
   "cell_type": "code",
2345
   "execution_count": null,
2346
   "metadata": {},
2347
   "outputs": [],
2348
   "source": []
2349
  }
2350
 ],
2351
 "metadata": {
2352
  "kernelspec": {
2353
   "display_name": "Python 3",
2354
   "language": "python",
2355
   "name": "python3"
2356
  },
2357
  "language_info": {
2358
   "codemirror_mode": {
2359
    "name": "ipython",
2360
    "version": 3
2361
   },
2362
   "file_extension": ".py",
2363
   "mimetype": "text/x-python",
2364
   "name": "python",
2365
   "nbconvert_exporter": "python",
2366
   "pygments_lexer": "ipython3",
2367
   "version": "3.6.5"
2368
  }
2369
 },
2370
 "nbformat": 4,
2371
 "nbformat_minor": 1
2372
}