--- a
+++ b/notebooks/Data_analysis.ipynb
@@ -0,0 +1,427 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Data Analysis notebook\n",
+    "\n",
+    "P. Benveniste $^1$, J. Alberge $^1$\n",
+    "\n",
+    "$^1$ Ecole Normale Supérieure Paris-Saclay\n",
+    "\n",
+    "In this Notebook, we perform the analysis of the final datasets after preprocessing and feature extraction."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import the libraries\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from tabulate import tabulate"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We now import both datasets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(55161, 10)\n",
+      "(48595, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Read the two preprocessed cohorts from the current working directory\n",
+    "plco_file = './preprocessed_plco.csv'\n",
+    "nlst_file = './preprocessed_nlst.csv'\n",
+    "plco = pd.read_csv(plco_file)\n",
+    "nlst = pd.read_csv(nlst_file)\n",
+    "\n",
+    "# Row counts of each cohort, used as the denominator of every percentage below\n",
+    "total_plco, total_nlst = len(plco), len(nlst)\n",
+    "print(plco.shape)\n",
+    "print(nlst.shape)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we perform data analysis for each of the following features:\n",
+    "- `age`: This feature captures the person’s age.\n",
+    "- `ssmokea_f`: This feature describes the age at which the person stopped smoking.\n",
+    "- `cig_stat`: This feature describes if the person is a current or a former cigarette smoker at the beginning of the study.\n",
+    "- `pack_years`: This feature refers to the number of packs smoked per day multiplied by the number of years during which the person smoked.\n",
+    "- `smokea_f`: This feature indicates the age at which the person started smoking.\n",
+    "- `cig_years`: This feature describes the total number of years during which the person smoked. \n",
+    "- `lung_fh`: This feature describes if the person has close family (parents, siblings or children) who had lung cancer.\n",
+    "- `bmi`: This feature describes the person’s body mass index.\n",
+    "- `lung_cancer`: This feature indicates if the person was diagnosed with lung cancer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------  -----  ------  -----  ------\n",
+      "Age             PLCO   PLCO %  NLST   NLST %\n",
+      "<= 50           0      0.0     1      0.0\n",
+      "50 < ... <= 60  27337  49.6    24861  51.2\n",
+      "60 < ... <= 70  25120  45.5    20901  43.0\n",
+      "> 70            2704   4.9     2832   5.8\n",
+      "Missing         0      0.0     0      0.0\n",
+      "--------------  -----  ------  -----  ------\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Age table: one row per age bin plus a row for missing values.\n",
+    "# Each row pairs a label with a boolean mask per cohort; the mask is summed once\n",
+    "# and reused for both the count and the percentage.\n",
+    "plco_age, nlst_age = plco['age'], nlst['age']\n",
+    "age_bins = [\n",
+    "    ('<= 50', plco_age < 51, nlst_age < 51),\n",
+    "    ('50 < ... <= 60', (plco_age >= 51) & (plco_age < 61), (nlst_age >= 51) & (nlst_age < 61)),\n",
+    "    ('60 < ... <= 70', (plco_age >= 61) & (plco_age < 71), (nlst_age >= 61) & (nlst_age < 71)),\n",
+    "    ('> 70', plco_age >= 71, nlst_age >= 71),\n",
+    "    ('Missing', plco_age.isna(), nlst_age.isna()),\n",
+    "]\n",
+    "table_age = [['Age', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
+    "for label, in_plco, in_nlst in age_bins:\n",
+    "    n_plco, n_nlst = in_plco.sum(), in_nlst.sum()\n",
+    "    table_age.append([label, n_plco, round(n_plco / total_plco * 100, 1),\n",
+    "                      n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
+    "print(tabulate(table_age))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------------  -----  ------  -----  ------\n",
+      "Smoking cessation age  PLCO   PLCO %  NLST   NLST %\n",
+      "<= 30                  10470  19.0    2      0.0\n",
+      "30 < ... <= 40         11886  21.5    130    0.3\n",
+      "40 < ... <= 50         11447  20.8    7025   14.5\n",
+      "50 < ... <= 60         8649   15.7    14071  29.0\n",
+      "> 60                   1942   3.5     4378   9.0\n",
+      "Missing                10767  19.5    22989  47.3\n",
+      "---------------------  -----  ------  -----  ------\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Smoking cessation age table: one row per age bin plus a row for missing values.\n",
+    "# Each mask is summed once and reused for both the count and the percentage.\n",
+    "plco_quit, nlst_quit = plco['ssmokea_f'], nlst['ssmokea_f']\n",
+    "quit_bins = [\n",
+    "    ('<= 30', plco_quit < 31, nlst_quit < 31),\n",
+    "    ('30 < ... <= 40', (plco_quit >= 31) & (plco_quit < 41), (nlst_quit >= 31) & (nlst_quit < 41)),\n",
+    "    ('40 < ... <= 50', (plco_quit >= 41) & (plco_quit < 51), (nlst_quit >= 41) & (nlst_quit < 51)),\n",
+    "    ('50 < ... <= 60', (plco_quit >= 51) & (plco_quit < 61), (nlst_quit >= 51) & (nlst_quit < 61)),\n",
+    "    ('> 60', plco_quit >= 61, nlst_quit >= 61),\n",
+    "    ('Missing', plco_quit.isna(), nlst_quit.isna()),\n",
+    "]\n",
+    "table_ssmokea_f = [['Smoking cessation age', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
+    "for label, in_plco, in_nlst in quit_bins:\n",
+    "    n_plco, n_nlst = in_plco.sum(), in_nlst.sum()\n",
+    "    table_ssmokea_f.append([label, n_plco, round(n_plco / total_plco * 100, 1),\n",
+    "                            n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
+    "print(tabulate(table_ssmokea_f))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------  -----  ------  -----  ------\n",
+      "Smoking status  PLCO   PLCO %  NLST   NLST %\n",
+      "Active          9965   18.1    22842  47.0\n",
+      "Former          45196  81.9    25753  53.0\n",
+      "Missing         0      0.0     0      0.0\n",
+      "--------------  -----  ------  -----  ------\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Smoking status table: code 1 is reported as 'Active' and code 2 as 'Former',\n",
+    "# followed by a row for missing values.\n",
+    "plco_status, nlst_status = plco['cig_stat'], nlst['cig_stat']\n",
+    "status_rows = [\n",
+    "    ('Active', plco_status == 1, nlst_status == 1),\n",
+    "    ('Former', plco_status == 2, nlst_status == 2),\n",
+    "    ('Missing', plco_status.isna(), nlst_status.isna()),\n",
+    "]\n",
+    "table_cig_stat = [['Smoking status', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
+    "for label, in_plco, in_nlst in status_rows:\n",
+    "    n_plco, n_nlst = in_plco.sum(), in_nlst.sum()\n",
+    "    table_cig_stat.append([label, n_plco, round(n_plco / total_plco * 100, 1),\n",
+    "                           n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
+    "\n",
+    "print(tabulate(table_cig_stat))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "---------------  -----  ------  -----  ------\n",
+      "Pack years       PLCO   PLCO %  NLST   NLST %\n",
+      "<= 25            26981  48.9    8      0.0\n",
+      "25 < ... <= 50   16147  29.3    26746  55.0\n",
+      "50 < ... <= 100  9448   17.1    19544  40.2\n",
+      "> 100            1434   2.6     2297   4.7\n",
+      "Missing          1151   2.1     0      0.0\n",
+      "---------------  -----  ------  -----  ------\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Pack-years table: one row per bin plus a row for missing values.\n",
+    "# NOTE(review): bounds such as `< 26` match the `<= 25` label only for whole-number\n",
+    "# values; a fractional value like 25.5 is counted in the `<= 25` row - confirm intended.\n",
+    "plco_packs, nlst_packs = plco['pack_years'], nlst['pack_years']\n",
+    "pack_bins = [\n",
+    "    ('<= 25', plco_packs < 26, nlst_packs < 26),\n",
+    "    ('25 < ... <= 50', (plco_packs >= 26) & (plco_packs < 51), (nlst_packs >= 26) & (nlst_packs < 51)),\n",
+    "    ('50 < ... <= 100', (plco_packs >= 51) & (plco_packs < 101), (nlst_packs >= 51) & (nlst_packs < 101)),\n",
+    "    ('> 100', plco_packs >= 101, nlst_packs >= 101),\n",
+    "    ('Missing', plco_packs.isna(), nlst_packs.isna()),\n",
+    "]\n",
+    "table_pack_years = [['Pack years', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
+    "for label, in_plco, in_nlst in pack_bins:\n",
+    "    n_plco, n_nlst = in_plco.sum(), in_nlst.sum()\n",
+    "    table_pack_years.append([label, n_plco, round(n_plco / total_plco * 100, 1),\n",
+    "                             n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
+    "print(tabulate(table_pack_years))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-----------------  -----  ------  -----  ------\n",
+      "Smoking onset age  PLCO   PLCO %  NLST   NLST %\n",
+      "<= 15              10169  18.4    17927  36.9\n",
+      "15 < ... <= 20     33760  61.2    25411  52.3\n",
+      "> 20               10950  19.9    5256   10.8\n",
+      "Missing            282    0.5     1      0.0\n",
+      "-----------------  -----  ------  -----  ------\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Smoking onset age table: one row per age bin plus a row for missing values.\n",
+    "# Each mask is summed once and reused for both the count and the percentage.\n",
+    "plco_onset, nlst_onset = plco['smokea_f'], nlst['smokea_f']\n",
+    "onset_bins = [\n",
+    "    ('<= 15', plco_onset < 16, nlst_onset < 16),\n",
+    "    ('15 < ... <= 20', (plco_onset >= 16) & (plco_onset < 21), (nlst_onset >= 16) & (nlst_onset < 21)),\n",
+    "    ('> 20', plco_onset >= 21, nlst_onset >= 21),\n",
+    "    ('Missing', plco_onset.isna(), nlst_onset.isna()),\n",
+    "]\n",
+    "table_smokea_f = [['Smoking onset age', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
+    "for label, in_plco, in_nlst in onset_bins:\n",
+    "    n_plco, n_nlst = in_plco.sum(), in_nlst.sum()\n",
+    "    table_smokea_f.append([label, n_plco, round(n_plco / total_plco * 100, 1),\n",
+    "                           n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
+    "print(tabulate(table_smokea_f))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------  -----  ------  -----  ------\n",
+      "Smoking years   PLCO   PLCO %  NLST   NLST %\n",
+      "<= 10           8800   16.0    2      0.0\n",
+      "10 < ... <= 20  11761  21.3    292    0.6\n",
+      "20 < ... <= 30  11532  20.9    5134   10.6\n",
+      "30 < ... <= 40  13037  23.6    21620  44.5\n",
+      "> 40            8963   16.2    21547  44.3\n",
+      "Missing         1068   1.9     0      0.0\n",
+      "--------------  -----  ------  -----  ------\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Smoking years table: one row per duration bin plus a row for missing values.\n",
+    "# Each mask is summed once and reused for both the count and the percentage.\n",
+    "plco_years, nlst_years = plco['cig_years'], nlst['cig_years']\n",
+    "years_bins = [\n",
+    "    ('<= 10', plco_years < 11, nlst_years < 11),\n",
+    "    ('10 < ... <= 20', (plco_years >= 11) & (plco_years < 21), (nlst_years >= 11) & (nlst_years < 21)),\n",
+    "    ('20 < ... <= 30', (plco_years >= 21) & (plco_years < 31), (nlst_years >= 21) & (nlst_years < 31)),\n",
+    "    ('30 < ... <= 40', (plco_years >= 31) & (plco_years < 41), (nlst_years >= 31) & (nlst_years < 41)),\n",
+    "    ('> 40', plco_years >= 41, nlst_years >= 41),\n",
+    "    ('Missing', plco_years.isna(), nlst_years.isna()),\n",
+    "]\n",
+    "table_cig_years = [['Smoking years', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
+    "for label, in_plco, in_nlst in years_bins:\n",
+    "    n_plco, n_nlst = in_plco.sum(), in_nlst.sum()\n",
+    "    table_cig_years.append([label, n_plco, round(n_plco / total_plco * 100, 1),\n",
+    "                            n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
+    "print(tabulate(table_cig_years))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "--------------------------  -----  ------  -----  ------\n",
+      "Lung cancer family history  PLCO   PLCO %  NLST   NLST %\n",
+      "No                          48415  87.8    37302  76.8\n",
+      "Yes                         6323   11.5    10598  21.8\n",
+      "Missing                     423    0.8     695    1.4\n",
+      "--------------------------  -----  ------  -----  ------\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Family history table: code 0 is reported as 'No' and code 1 as 'Yes',\n",
+    "# followed by a row for missing values.\n",
+    "plco_fh, nlst_fh = plco['lung_fh'], nlst['lung_fh']\n",
+    "family_rows = [\n",
+    "    ('No', plco_fh == 0, nlst_fh == 0),\n",
+    "    ('Yes', plco_fh == 1, nlst_fh == 1),\n",
+    "    ('Missing', plco_fh.isna(), nlst_fh.isna()),\n",
+    "]\n",
+    "table_lung_fh = [['Lung cancer family history', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
+    "for label, in_plco, in_nlst in family_rows:\n",
+    "    n_plco, n_nlst = in_plco.sum(), in_nlst.sum()\n",
+    "    table_lung_fh.append([label, n_plco, round(n_plco / total_plco * 100, 1),\n",
+    "                          n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
+    "print(tabulate(table_lung_fh))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "------------------------------------  -----  ------  -----  ------\n",
+      "Body Mass Index                       PLCO   PLCO %  NLST   NLST %\n",
+      "Underweight (... <= 18.4)             295    0.5     347    0.7\n",
+      "Healthy weight (18.5 <= ... <= 24.9)  17556  31.8    13404  27.6\n",
+      "Overweight (25 <= ... <= 29.9)        23920  43.4    20894  43.0\n",
+      "Obesity (... >= 30)                   12631  22.9    13696  28.2\n",
+      "Missing                               759    1.4     234    0.5\n",
+      "------------------------------------  -----  ------  -----  ------\n"
+     ]
+    }
+   ],
+   "source": [
+    "table_bmi = [['Body Mass Index', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n",
+    "                ['Underweight (... <= 18.4)', plco[plco['bmi']<18.5]['bmi'].count(), round(plco[plco['bmi']<18.4]['bmi'].count() / total_plco * 100,1), nlst[nlst['bmi']<18.4]['bmi'].count(), round(nlst[nlst['bmi']<18.4]['bmi'].count() / total_nlst * 100,1)],\n",
+    "                ['Healthy weight (18.5 <= ... <= 24.9)',plco[(plco['bmi']>=18.5) & (plco['bmi']<25)]['bmi'].count(),  round(plco[(plco['bmi']>=18.5) & (plco['bmi']<25)]['bmi'].count()/ total_plco * 100,1), nlst[(nlst['bmi']>=18.5) & (nlst['bmi']<25)]['bmi'].count(), round(nlst[(nlst['bmi']>=18.5) & (nlst['bmi']<25)]['bmi'].count() / total_nlst * 100,1)],\n",
+    "                ['Overweight (25 <= ... <= 29.9)',plco[(plco['bmi']>=25) & (plco['bmi']<30)]['bmi'].count(), round(plco[(plco['bmi']>=25) & (plco['bmi']<30)]['bmi'].count() / total_plco * 100,1), nlst[(nlst['bmi']>=25) & (nlst['bmi']<30)]['bmi'].count(),round(nlst[(nlst['bmi']>=25) & (nlst['bmi']<30)]['bmi'].count() / total_nlst * 100,1)],\n",
+    "                ['Obesity (... >= 30)',plco[(plco['bmi']>=30)]['bmi'].count(), round(plco[(plco['bmi']>=30)]['bmi'].count() / total_plco * 100,1), nlst[(nlst['bmi']>=30)]['bmi'].count(), round(nlst[(nlst['bmi']>=30)]['bmi'].count() / total_nlst * 100,1)],\n",
+    "                ['Missing',plco['bmi'].isna().sum(), round(plco['bmi'].isna().sum() / total_plco * 100,1), nlst['bmi'].isna().sum(), round(nlst['bmi'].isna().sum() / total_nlst * 100,1)]]            \n",
+    "print(tabulate(table_bmi))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-----------  -----  ------  -----  ------\n",
+      "Lung cancer  PLCO   PLCO %  NLST   NLST %\n",
+      "Negative     52409  95.0    47084  96.9\n",
+      "Positive     2752   5.0     1511   3.1\n",
+      "Missing      0      0.0     0      0.0\n",
+      "-----------  -----  ------  -----  ------\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Lung cancer outcome table: code 0 is reported as 'Negative' and code 1 as 'Positive',\n",
+    "# followed by a row for missing values.\n",
+    "plco_cancer, nlst_cancer = plco['lung_cancer'], nlst['lung_cancer']\n",
+    "outcome_rows = [\n",
+    "    ('Negative', plco_cancer == 0, nlst_cancer == 0),\n",
+    "    ('Positive', plco_cancer == 1, nlst_cancer == 1),\n",
+    "    ('Missing', plco_cancer.isna(), nlst_cancer.isna()),\n",
+    "]\n",
+    "table_lung_cancer = [['Lung cancer', 'PLCO', 'PLCO %', 'NLST', 'NLST %']]\n",
+    "for label, in_plco, in_nlst in outcome_rows:\n",
+    "    n_plco, n_nlst = in_plco.sum(), in_nlst.sum()\n",
+    "    table_lung_cancer.append([label, n_plco, round(n_plco / total_plco * 100, 1),\n",
+    "                              n_nlst, round(n_nlst / total_nlst * 100, 1)])\n",
+    "print(tabulate(table_lung_cancer))"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Saving a txt file\n",
+    "\n",
+    "Now we write a text file to concatenate these analyses. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File edited\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Each entry pairs the description line of a feature with the table built for it above;\n",
+    "# the entries are written to the report in this order.\n",
+    "report_sections = [\n",
+    "    ('Age: This feature captures the person’s age. \\n', table_age),\n",
+    "    ('Smoking cessation age: This feature describes the age at which the person stopped smoking. \\n', table_ssmokea_f),\n",
+    "    ('Smoking status: This feature describes if the person is a current or a former cigarette smoker at the beginning of the study. \\n', table_cig_stat),\n",
+    "    ('Pack-years: This feature refers to the number of packs smoked per day multiplied by the number of years during which the person smoked. \\n', table_pack_years),\n",
+    "    ('Smoking onset age: This feature indicates the age at which the person started smoking. \\n', table_smokea_f),\n",
+    "    ('Years smoked: This feature describes the total number of years during which the person smoked. \\n', table_cig_years),\n",
+    "    ('Lung family history: This feature describes if the person has close family (parents, siblings or child) who had lung cancer. \\n', table_lung_fh),\n",
+    "    ('BMI: This feature describes the person’s body mass index. \\n', table_bmi),\n",
+    "    ('Lung cancer: This feature indicates if the person was diagnosed with lung cancer. \\n', table_lung_cancer),\n",
+    "]\n",
+    "\n",
+    "# The encoding is set explicitly because the descriptions contain a non-ASCII apostrophe;\n",
+    "# without it the result depends on the platform's default encoding.\n",
+    "with open('./data_analysis.txt', 'w', encoding='utf-8') as f:\n",
+    "    f.write('------------ PRE-PROCESSED DATA ANALYSIS ------------ \\n \\n')\n",
+    "    f.write('We perform data analysis on each features of the PLCO and NLST dataset.\\n')\n",
+    "    f.write('Number of participants: \\n')\n",
+    "    f.write('  - PLCO: ' + str(total_plco) + '\\n')\n",
+    "    f.write('  - NLST: ' + str(total_nlst) + '\\n \\n')\n",
+    "    f.write('--- Feature analysis --- \\n\\n')\n",
+    "    for description, table in report_sections:\n",
+    "        f.write(description)\n",
+    "        f.write(tabulate(table))\n",
+    "        f.write('\\n\\n')\n",
+    "    # One extra newline so the file ends with three line breaks after the last table\n",
+    "    f.write('\\n')\n",
+    "print(\"File edited\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}