Switch to unified view

a b/notebooks/Data_analysis.ipynb
1
{
2
 "cells": [
3
  {
4
   "attachments": {},
5
   "cell_type": "markdown",
6
   "metadata": {},
7
   "source": [
8
    "# Data Analysis notebook\n",
9
    "\n",
10
    "P. Benveniste $^1$, J. Alberge $^1$\n",
11
    "\n",
12
    "$^1$ Ecole Normale Supérieure Paris-Saclay\n",
13
    "\n",
14
    "In this Notebook, we perform the analysis of the final datasets after preprocessing and feature extraction."
15
   ]
16
  },
17
  {
18
   "cell_type": "code",
19
   "execution_count": 1,
20
   "metadata": {},
21
   "outputs": [],
22
   "source": [
23
    "# Import the libraries\n",
24
    "import pandas as pd\n",
25
    "import numpy as np\n",
26
    "import matplotlib.pyplot as plt\n",
27
    "from tabulate import tabulate"
28
   ]
29
  },
30
  {
31
   "attachments": {},
32
   "cell_type": "markdown",
33
   "metadata": {},
34
   "source": [
35
    "We now import both datasets."
36
   ]
37
  },
38
  {
39
   "cell_type": "code",
40
   "execution_count": 2,
41
   "metadata": {},
42
   "outputs": [
43
    {
44
     "name": "stdout",
45
     "output_type": "stream",
46
     "text": [
47
      "(55161, 10)\n",
48
      "(48595, 10)\n"
49
     ]
50
    }
51
   ],
52
   "source": [
53
    "# Locations of the two preprocessed datasets\n",
    "plco_file = './preprocessed_plco.csv'\n",
    "nlst_file = './preprocessed_nlst.csv'\n",
    "\n",
    "# PLCO: read the table, keep its row count for later use, and show its shape\n",
    "plco = pd.read_csv(plco_file)\n",
    "total_plco = plco.shape[0]\n",
    "print(plco.shape)\n",
    "\n",
    "# NLST: same steps\n",
    "nlst = pd.read_csv(nlst_file)\n",
    "total_nlst = nlst.shape[0]\n",
    "print(nlst.shape)"
63
   ]
64
  },
65
  {
66
   "attachments": {},
67
   "cell_type": "markdown",
68
   "metadata": {},
69
   "source": [
70
    "Now we perform data analysis for each of the following features:\n",
71
    "- `age`: This feature captures the person’s age.\n",
72
    "- `ssmokea_f`: This feature describes the age at which the person stopped smoking.\n",
73
    "- `cig_stat`: This feature describes if the person is a current or a former cigarette smoker at the beginning of the study.\n",
74
    "- `pack_years`: This feature refers to the number of packs smoked per day multiplied by the number of years during which the person smoked.\n",
75
    "- `smokea_f`: This feature indicates the age at which the person started smoking.\n",
76
    "- `cig_years`: This feature describes the total number of years during which the person smoked. \n",
77
    "- `lung_fh`:  This feature describes if the person has close family (parents, siblings or child) who had lung cancer.\n",
78
    "- `bmi`: This feature describes the person’s body mass index.\n",
79
    "- `lung_cancer`: This feature indicates if the person was diagnosed with lung cancer."
80
   ]
81
  },
82
  {
83
   "cell_type": "code",
84
   "execution_count": 3,
85
   "metadata": {},
86
   "outputs": [
87
    {
88
     "name": "stdout",
89
     "output_type": "stream",
90
     "text": [
91
      "--------------  -----  ------  -----  ------\n",
92
      "Age             PLCO   PLCO %  NLST   NLST %\n",
93
      "<= 50           0      0.0     1      0.0\n",
94
      "50 < ... <= 60  27337  49.6    24861  51.2\n",
95
      "60 < ... <= 70  25120  45.5    20901  43.0\n",
96
      "> 70            2704   4.9     2832   5.8\n",
97
      "Missing         0      0.0     0      0.0\n",
98
      "--------------  -----  ------  -----  ------\n"
99
     ]
100
    }
101
   ],
102
   "source": [
103
    "# Age distribution of both cohorts. Each entry pairs a row label with the test that\n",
    "# selects the row's members; a missing age fails every comparison, so it is counted\n",
    "# only in the 'Missing' row.\n",
    "# NOTE(review): the cut-offs (age < 51 for '<= 50', ...) agree with the labels only\n",
    "# for whole-year ages - confirm that the column holds integers.\n",
    "age_bins = [\n",
    "    ('<= 50', lambda age: age < 51),\n",
    "    ('50 < ... <= 60', lambda age: (age >= 51) & (age < 61)),\n",
    "    ('60 < ... <= 70', lambda age: (age >= 61) & (age < 71)),\n",
    "    ('> 70', lambda age: age >= 71),\n",
    "    ('Missing', lambda age: age.isna()),\n",
    "]\n",
    "\n",
    "# First row is the header; every following row is: label, PLCO count, PLCO %, NLST count, NLST %\n",
    "table_age = [['Age', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
    "for bin_label, in_bin in age_bins:\n",
    "    n_plco = in_bin(plco['age']).sum()\n",
    "    n_nlst = in_bin(nlst['age']).sum()\n",
    "    table_age.append([bin_label, n_plco, round(n_plco / total_plco * 100, 1), n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
    "print(tabulate(table_age))"
110
   ]
111
  },
112
  {
113
   "cell_type": "code",
114
   "execution_count": 4,
115
   "metadata": {},
116
   "outputs": [
117
    {
118
     "name": "stdout",
119
     "output_type": "stream",
120
     "text": [
121
      "---------------------  -----  ------  -----  ------\n",
122
      "Smoking cessation age  PLCO   PLCO %  NLST   NLST %\n",
123
      "<= 30                  10470  19.0    2      0.0\n",
124
      "30 < ... <= 40         11886  21.5    130    0.3\n",
125
      "40 < ... <= 50         11447  20.8    7025   14.5\n",
126
      "50 < ... <= 60         8649   15.7    14071  29.0\n",
127
      "> 60                   1942   3.5     4378   9.0\n",
128
      "Missing                10767  19.5    22989  47.3\n",
129
      "---------------------  -----  ------  -----  ------\n"
130
     ]
131
    }
132
   ],
133
   "source": [
134
    "# Smoking cessation age in both cohorts. Each entry pairs a row label with the test\n",
    "# that selects the row's members; a missing value fails every comparison, so it is\n",
    "# counted only in the 'Missing' row.\n",
    "# NOTE(review): the cut-offs (< 31 for '<= 30', ...) agree with the labels only for\n",
    "# whole-year ages - confirm that the column holds integers.\n",
    "cessation_bins = [\n",
    "    ('<= 30', lambda quit_age: quit_age < 31),\n",
    "    ('30 < ... <= 40', lambda quit_age: (quit_age >= 31) & (quit_age < 41)),\n",
    "    ('40 < ... <= 50', lambda quit_age: (quit_age >= 41) & (quit_age < 51)),\n",
    "    ('50 < ... <= 60', lambda quit_age: (quit_age >= 51) & (quit_age < 61)),\n",
    "    ('> 60', lambda quit_age: quit_age >= 61),\n",
    "    ('Missing', lambda quit_age: quit_age.isna()),\n",
    "]\n",
    "\n",
    "# First row is the header; every following row is: label, PLCO count, PLCO %, NLST count, NLST %\n",
    "table_ssmokea_f = [['Smoking cessation age', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
    "for bin_label, in_bin in cessation_bins:\n",
    "    n_plco = in_bin(plco['ssmokea_f']).sum()\n",
    "    n_nlst = in_bin(nlst['ssmokea_f']).sum()\n",
    "    table_ssmokea_f.append([bin_label, n_plco, round(n_plco / total_plco * 100, 1), n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
    "print(tabulate(table_ssmokea_f))"
142
   ]
143
  },
144
  {
145
   "cell_type": "code",
146
   "execution_count": 5,
147
   "metadata": {},
148
   "outputs": [
149
    {
150
     "name": "stdout",
151
     "output_type": "stream",
152
     "text": [
153
      "--------------  -----  ------  -----  ------\n",
154
      "Smoking status  PLCO   PLCO %  NLST   NLST %\n",
155
      "Active          9965   18.1    22842  47.0\n",
156
      "Former          45196  81.9    25753  53.0\n",
157
      "Missing         0      0.0     0      0.0\n",
158
      "--------------  -----  ------  -----  ------\n"
159
     ]
160
    }
161
   ],
162
   "source": [
163
    "# Smoking status in both cohorts: code 1 is reported as 'Active', code 2 as 'Former'\n",
    "status_codes = {'Active': 1, 'Former': 2}\n",
    "\n",
    "# First row is the header; every following row is: label, PLCO count, PLCO %, NLST count, NLST %\n",
    "table_cig_stat = [['Smoking status', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
    "for status_label, code in status_codes.items():\n",
    "    n_plco = (plco['cig_stat'] == code).sum()\n",
    "    n_nlst = (nlst['cig_stat'] == code).sum()\n",
    "    table_cig_stat.append([status_label, n_plco, round(n_plco / total_plco * 100, 1), n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
    "\n",
    "# Participants without a recorded status get their own row\n",
    "missing_plco = plco['cig_stat'].isna().sum()\n",
    "missing_nlst = nlst['cig_stat'].isna().sum()\n",
    "table_cig_stat.append(['Missing', missing_plco, round(missing_plco / total_plco * 100, 1), missing_nlst, round(missing_nlst / total_nlst * 100, 1)])\n",
    "\n",
    "print(tabulate(table_cig_stat))"
169
   ]
170
  },
171
  {
172
   "cell_type": "code",
173
   "execution_count": 6,
174
   "metadata": {},
175
   "outputs": [
176
    {
177
     "name": "stdout",
178
     "output_type": "stream",
179
     "text": [
180
      "---------------  -----  ------  -----  ------\n",
181
      "Pack years       PLCO   PLCO %  NLST   NLST %\n",
182
      "<= 25            26981  48.9    8      0.0\n",
183
      "25 < ... <= 50   16147  29.3    26746  55.0\n",
184
      "50 < ... <= 100  9448   17.1    19544  40.2\n",
185
      "> 100            1434   2.6     2297   4.7\n",
186
      "Missing          1151   2.1     0      0.0\n",
187
      "---------------  -----  ------  -----  ------\n"
188
     ]
189
    }
190
   ],
191
   "source": [
192
    "# Pack-years distribution of both cohorts. Each entry pairs a row label with the test\n",
    "# that selects the row's members; a missing value fails every comparison, so it is\n",
    "# counted only in the 'Missing' row.\n",
    "# Pack-years is a product (packs per day x years smoked) and need not be a whole number,\n",
    "# so every test uses exactly the edge written in its label. Whole-number cut-offs such as\n",
    "# '< 26' would put a value like 25.5 in the '<= 25' row.\n",
    "pack_year_bins = [\n",
    "    ('<= 25', lambda packs: packs <= 25),\n",
    "    ('25 < ... <= 50', lambda packs: (packs > 25) & (packs <= 50)),\n",
    "    ('50 < ... <= 100', lambda packs: (packs > 50) & (packs <= 100)),\n",
    "    ('> 100', lambda packs: packs > 100),\n",
    "    ('Missing', lambda packs: packs.isna()),\n",
    "]\n",
    "\n",
    "# First row is the header; every following row is: label, PLCO count, PLCO %, NLST count, NLST %\n",
    "table_pack_years = [['Pack years', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
    "for bin_label, in_bin in pack_year_bins:\n",
    "    n_plco = in_bin(plco['pack_years']).sum()\n",
    "    n_nlst = in_bin(nlst['pack_years']).sum()\n",
    "    table_pack_years.append([bin_label, n_plco, round(n_plco / total_plco * 100, 1), n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
    "print(tabulate(table_pack_years))"
199
   ]
200
  },
201
  {
202
   "cell_type": "code",
203
   "execution_count": 7,
204
   "metadata": {},
205
   "outputs": [
206
    {
207
     "name": "stdout",
208
     "output_type": "stream",
209
     "text": [
210
      "-----------------  -----  ------  -----  ------\n",
211
      "Smoking onset age  PLCO   PLCO %  NLST   NLST %\n",
212
      "<= 15              10169  18.4    17927  36.9\n",
213
      "15 < ... <= 20     33760  61.2    25411  52.3\n",
214
      "> 20               10950  19.9    5256   10.8\n",
215
      "Missing            282    0.5     1      0.0\n",
216
      "-----------------  -----  ------  -----  ------\n"
217
     ]
218
    }
219
   ],
220
   "source": [
221
    "# Smoking onset age in both cohorts. Each entry pairs a row label with the test that\n",
    "# selects the row's members; a missing value fails every comparison, so it is counted\n",
    "# only in the 'Missing' row.\n",
    "# NOTE(review): the cut-offs (< 16 for '<= 15', ...) agree with the labels only for\n",
    "# whole-year ages - confirm that the column holds integers.\n",
    "onset_bins = [\n",
    "    ('<= 15', lambda start_age: start_age < 16),\n",
    "    ('15 < ... <= 20', lambda start_age: (start_age >= 16) & (start_age < 21)),\n",
    "    ('> 20', lambda start_age: start_age >= 21),\n",
    "    ('Missing', lambda start_age: start_age.isna()),\n",
    "]\n",
    "\n",
    "# First row is the header; every following row is: label, PLCO count, PLCO %, NLST count, NLST %\n",
    "table_smokea_f = [['Smoking onset age', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
    "for bin_label, in_bin in onset_bins:\n",
    "    n_plco = in_bin(plco['smokea_f']).sum()\n",
    "    n_nlst = in_bin(nlst['smokea_f']).sum()\n",
    "    table_smokea_f.append([bin_label, n_plco, round(n_plco / total_plco * 100, 1), n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
    "print(tabulate(table_smokea_f))"
227
   ]
228
  },
229
  {
230
   "cell_type": "code",
231
   "execution_count": 8,
232
   "metadata": {},
233
   "outputs": [
234
    {
235
     "name": "stdout",
236
     "output_type": "stream",
237
     "text": [
238
      "--------------  -----  ------  -----  ------\n",
239
      "Smoking years   PLCO   PLCO %  NLST   NLST %\n",
240
      "<= 10           8800   16.0    2      0.0\n",
241
      "10 < ... <= 20  11761  21.3    292    0.6\n",
242
      "20 < ... <= 30  11532  20.9    5134   10.6\n",
243
      "30 < ... <= 40  13037  23.6    21620  44.5\n",
244
      "> 40            8963   16.2    21547  44.3\n",
245
      "Missing         1068   1.9     0      0.0\n",
246
      "--------------  -----  ------  -----  ------\n"
247
     ]
248
    }
249
   ],
250
   "source": [
251
    "# Years smoked in both cohorts. Each entry pairs a row label with the test that selects\n",
    "# the row's members; a missing value fails every comparison, so it is counted only in\n",
    "# the 'Missing' row.\n",
    "# NOTE(review): the cut-offs (< 11 for '<= 10', ...) agree with the labels only for\n",
    "# whole numbers of years - confirm that the column holds integers.\n",
    "smoking_year_bins = [\n",
    "    ('<= 10', lambda years: years < 11),\n",
    "    ('10 < ... <= 20', lambda years: (years >= 11) & (years < 21)),\n",
    "    ('20 < ... <= 30', lambda years: (years >= 21) & (years < 31)),\n",
    "    ('30 < ... <= 40', lambda years: (years >= 31) & (years < 41)),\n",
    "    ('> 40', lambda years: years >= 41),\n",
    "    ('Missing', lambda years: years.isna()),\n",
    "]\n",
    "\n",
    "# First row is the header; every following row is: label, PLCO count, PLCO %, NLST count, NLST %\n",
    "table_cig_years = [['Smoking years', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
    "for bin_label, in_bin in smoking_year_bins:\n",
    "    n_plco = in_bin(plco['cig_years']).sum()\n",
    "    n_nlst = in_bin(nlst['cig_years']).sum()\n",
    "    table_cig_years.append([bin_label, n_plco, round(n_plco / total_plco * 100, 1), n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
    "print(tabulate(table_cig_years))"
259
   ]
260
  },
261
  {
262
   "cell_type": "code",
263
   "execution_count": 9,
264
   "metadata": {},
265
   "outputs": [
266
    {
267
     "name": "stdout",
268
     "output_type": "stream",
269
     "text": [
270
      "--------------------------  -----  ------  -----  ------\n",
271
      "Lung cancer family history  PLCO   PLCO %  NLST   NLST %\n",
272
      "No                          48415  87.8    37302  76.8\n",
273
      "Yes                         6323   11.5    10598  21.8\n",
274
      "Missing                     423    0.8     695    1.4\n",
275
      "--------------------------  -----  ------  -----  ------\n"
276
     ]
277
    }
278
   ],
279
   "source": [
280
    "# Family history of lung cancer in both cohorts: code 0 is reported as 'No', code 1 as 'Yes'\n",
    "family_history_codes = {'No': 0, 'Yes': 1}\n",
    "\n",
    "# First row is the header; every following row is: label, PLCO count, PLCO %, NLST count, NLST %\n",
    "table_lung_fh = [['Lung cancer family history', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
    "for answer, code in family_history_codes.items():\n",
    "    n_plco = (plco['lung_fh'] == code).sum()\n",
    "    n_nlst = (nlst['lung_fh'] == code).sum()\n",
    "    table_lung_fh.append([answer, n_plco, round(n_plco / total_plco * 100, 1), n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
    "\n",
    "# Participants without a recorded answer get their own row\n",
    "missing_plco = plco['lung_fh'].isna().sum()\n",
    "missing_nlst = nlst['lung_fh'].isna().sum()\n",
    "table_lung_fh.append(['Missing', missing_plco, round(missing_plco / total_plco * 100, 1), missing_nlst, round(missing_nlst / total_nlst * 100, 1)])\n",
    "print(tabulate(table_lung_fh))"
285
   ]
286
  },
287
  {
288
   "cell_type": "code",
289
   "execution_count": 10,
290
   "metadata": {},
291
   "outputs": [
292
    {
293
     "name": "stdout",
294
     "output_type": "stream",
295
     "text": [
296
      "------------------------------------  -----  ------  -----  ------\n",
297
      "Body Mass Index                       PLCO   PLCO %  NLST   NLST %\n",
298
      "Underweight (... <= 18.4)             295    0.5     347    0.7\n",
299
      "Healthy weight (18.5 <= ... <= 24.9)  17556  31.8    13404  27.6\n",
300
      "Overweight (25 <= ... <= 29.9)        23920  43.4    20894  43.0\n",
301
      "Obesity (... >= 30)                   12631  22.9    13696  28.2\n",
302
      "Missing                               759    1.4     234    0.5\n",
303
      "------------------------------------  -----  ------  -----  ------\n"
304
     ]
305
    }
306
   ],
307
   "source": [
308
    "# Body mass index distribution of both cohorts. Each entry pairs a row label with the\n",
    "# test that selects the row's members; a missing value fails every comparison, so it is\n",
    "# counted only in the 'Missing' row.\n",
    "# The underweight test is '< 18.5' for every figure in the row, so it meets the\n",
    "# 'Healthy weight' lower edge (>= 18.5) with no gap. A cut-off of 18.4 would leave\n",
    "# values in [18.4, 18.5) outside every row.\n",
    "bmi_bins = [\n",
    "    ('Underweight (... <= 18.4)', lambda bmi: bmi < 18.5),\n",
    "    ('Healthy weight (18.5 <= ... <= 24.9)', lambda bmi: (bmi >= 18.5) & (bmi < 25)),\n",
    "    ('Overweight (25 <= ... <= 29.9)', lambda bmi: (bmi >= 25) & (bmi < 30)),\n",
    "    ('Obesity (... >= 30)', lambda bmi: bmi >= 30),\n",
    "    ('Missing', lambda bmi: bmi.isna()),\n",
    "]\n",
    "\n",
    "# First row is the header; every following row is: label, PLCO count, PLCO %, NLST count, NLST %\n",
    "table_bmi = [['Body Mass Index', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
    "for bin_label, in_bin in bmi_bins:\n",
    "    n_plco = in_bin(plco['bmi']).sum()\n",
    "    n_nlst = in_bin(nlst['bmi']).sum()\n",
    "    table_bmi.append([bin_label, n_plco, round(n_plco / total_plco * 100, 1), n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
    "print(tabulate(table_bmi))"
315
   ]
316
  },
317
  {
318
   "cell_type": "code",
319
   "execution_count": 11,
320
   "metadata": {},
321
   "outputs": [
322
    {
323
     "name": "stdout",
324
     "output_type": "stream",
325
     "text": [
326
      "-----------  -----  ------  -----  ------\n",
327
      "Lung cancer  PLCO   PLCO %  NLST   NLST %\n",
328
      "Negative     52409  95.0    47084  96.9\n",
329
      "Positive     2752   5.0     1511   3.1\n",
330
      "Missing      0      0.0     0      0.0\n",
331
      "-----------  -----  ------  -----  ------\n"
332
     ]
333
    }
334
   ],
335
   "source": [
336
    "# Lung cancer outcome in both cohorts: code 0 is reported as 'Negative', code 1 as 'Positive'\n",
    "outcome_codes = {'Negative': 0, 'Positive': 1}\n",
    "\n",
    "# First row is the header; every following row is: label, PLCO count, PLCO %, NLST count, NLST %\n",
    "table_lung_cancer = [['Lung cancer', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
    "for outcome, code in outcome_codes.items():\n",
    "    n_plco = (plco['lung_cancer'] == code).sum()\n",
    "    n_nlst = (nlst['lung_cancer'] == code).sum()\n",
    "    table_lung_cancer.append([outcome, n_plco, round(n_plco / total_plco * 100, 1), n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
    "\n",
    "# Participants without a recorded outcome get their own row\n",
    "missing_plco = plco['lung_cancer'].isna().sum()\n",
    "missing_nlst = nlst['lung_cancer'].isna().sum()\n",
    "table_lung_cancer.append(['Missing', missing_plco, round(missing_plco / total_plco * 100, 1), missing_nlst, round(missing_nlst / total_nlst * 100, 1)])\n",
    "print(tabulate(table_lung_cancer))"
341
   ]
342
  },
343
  {
344
   "attachments": {},
345
   "cell_type": "markdown",
346
   "metadata": {},
347
   "source": [
348
    "### Saving a txt file\n",
349
    "\n",
350
    "Now we write a text file to concatenate these analyses. "
351
   ]
352
  },
353
  {
354
   "cell_type": "code",
355
   "execution_count": 12,
356
   "metadata": {},
357
   "outputs": [
358
    {
359
     "name": "stdout",
360
     "output_type": "stream",
361
     "text": [
362
      "File edited\n"
363
     ]
364
    }
365
   ],
366
   "source": [
367
    "# Report sections in output order: the description line written above each table,\n",
    "# paired with the table built in the cells above.\n",
    "report_sections = [\n",
    "    ('Age: This feature captures the person’s age. \\n', table_age),\n",
    "    ('Smoking cessation age: This feature describes the age at which the person stopped smoking. \\n', table_ssmokea_f),\n",
    "    ('Smoking status: This feature describes if the person is a current or a former cigarette smoker at the beginning of the study. \\n', table_cig_stat),\n",
    "    ('Pack-years: This feature refers to the number of packs smoked per day multiplied by the number of years during which the person smoked. \\n', table_pack_years),\n",
    "    ('Smoking onset age: This feature indicates the age at which the person started smoking. \\n', table_smokea_f),\n",
    "    ('Years smoked: This feature describes the total number of years during which the person smoked. \\n', table_cig_years),\n",
    "    ('Lung family history: This feature describes if the person has close family (parents, siblings or child) who had lung cancer. \\n', table_lung_fh),\n",
    "    ('BMI: This feature describes the person’s body mass index. \\n', table_bmi),\n",
    "    ('Lung cancer: This feature indicates if the person was diagnosed with lung cancer. \\n', table_lung_cancer),\n",
    "]\n",
    "\n",
    "# The descriptions contain a typographic apostrophe (’), so the encoding is set explicitly\n",
    "# instead of relying on the platform default, which may be unable to encode it.\n",
    "with open('./data_analysis.txt', 'w', encoding='utf-8') as f:\n",
    "    # Header: title, short introduction and the size of each cohort\n",
    "    f.write('------------ PRE-PROCESSED DATA ANALYSIS ------------ \\n \\n')\n",
    "    f.write('We perform data analysis on each features of the PLCO and NLST dataset.\\n')\n",
    "    f.write('Number of participants: \\n')\n",
    "    f.write('  - PLCO: ' + str(total_plco) + '\\n')\n",
    "    f.write('  - NLST: ' + str(total_nlst) + '\\n \\n')\n",
    "    f.write('--- Feature analysis --- \\n\\n')\n",
    "    for section_number, (description, table) in enumerate(report_sections, start=1):\n",
    "        f.write(description)\n",
    "        f.write(tabulate(table))\n",
    "        # Two line breaks separate sections; the last section is followed by three\n",
    "        f.write('\\n\\n\\n' if section_number == len(report_sections) else '\\n\\n')\n",
    "print(\"File edited\")"
402
   ]
403
  }
404
 ],
405
 "metadata": {
406
  "kernelspec": {
407
   "display_name": ".venv",
408
   "language": "python",
409
   "name": "python3"
410
  },
411
  "language_info": {
412
   "codemirror_mode": {
413
    "name": "ipython",
414
    "version": 3
415
   },
416
   "file_extension": ".py",
417
   "mimetype": "text/x-python",
418
   "name": "python",
419
   "nbconvert_exporter": "python",
420
   "pygments_lexer": "ipython3",
421
   "version": "3.8.9"
422
  },
423
  "orig_nbformat": 4
424
 },
425
 "nbformat": 4,
426
 "nbformat_minor": 2
427
}