--- a +++ b/notebooks/Data_analysis.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Data Analysis notebook\n", + "\n", + "P. Benveniste $^1$, J. Alberge $^1$\n", + "\n", + "$^1$ Ecole Normale Supérieure Paris-Saclay\n", + "\n", + "In this notebook, we perform the analysis of the final datasets after preprocessing and feature extraction." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Import the libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "from tabulate import tabulate" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We now load both datasets." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(55161, 10)\n", + "(48595, 10)\n" + ] + } + ], + "source": [ + "#Loading of both datasets\n", + "plco_file = './preprocessed_plco.csv'\n", + "plco = pd.read_csv(plco_file)\n", + "nlst_file = './preprocessed_nlst.csv'\n", + "nlst = pd.read_csv(nlst_file)\n", + "\n", + "total_plco = len(plco)\n", + "print(plco.shape)\n", + "total_nlst = len(nlst)\n", + "print(nlst.shape)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we perform data analysis for each of the following features:\n", + "- `age`: This feature captures the person’s age.\n", + "- `ssmokea_f`: This feature describes the age at which the person stopped smoking.\n", + "- `cig_stat`: This feature describes whether the person is a current or a former cigarette smoker at the beginning of the study.\n", + "- `pack_years`: This feature refers to the number of packs smoked per day multiplied by the number of years during which the person smoked.\n", + "- `smokea_f`: This feature indicates the age at which the 
person started smoking.\n", + "- `cig_years`: This feature describes the total number of years during which the person smoked. \n", + "- `lung_fh`: This feature describes whether the person has close family (parents, siblings or children) who had lung cancer.\n", + "- `bmi`: This feature describes the person’s body mass index.\n", + "- `lung_cancer`: This feature indicates whether the person was diagnosed with lung cancer." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------- ----- ------ ----- ------\n", + "Age PLCO PLCO % NLST NLST %\n", + "<= 50 0 0.0 1 0.0\n", + "50 < ... <= 60 27337 49.6 24861 51.2\n", + "60 < ... <= 70 25120 45.5 20901 43.0\n", + "> 70 2704 4.9 2832 5.8\n", + "Missing 0 0.0 0 0.0\n", + "-------------- ----- ------ ----- ------\n" + ] + } + ], + "source": [ + "table_age = [['Age', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n", + " ['<= 50', plco[plco['age']<51]['age'].count(), round(plco[plco['age']<51]['age'].count() / total_plco * 100,1), nlst[nlst['age']<51]['age'].count(), round(nlst[nlst['age']<51]['age'].count() / total_nlst * 100,1)],\n", + " ['50 < ... <= 60',plco[(plco['age']>=51) & (plco['age']<61)]['age'].count(), round(plco[(plco['age']>=51) & (plco['age']<61)]['age'].count()/ total_plco * 100,1), nlst[(nlst['age']>=51) & (nlst['age']<61)]['age'].count(), round(nlst[(nlst['age']>=51) & (nlst['age']<61)]['age'].count() / total_nlst * 100,1)],\n", + " ['60 < ... 
<= 70',plco[(plco['age']>=61) & (plco['age']<71)]['age'].count(), round(plco[(plco['age']>=61) & (plco['age']<71)]['age'].count() / total_plco * 100,1), nlst[(nlst['age']>=61) & (nlst['age']<71)]['age'].count(), round(nlst[(nlst['age']>=61) & (nlst['age']<71)]['age'].count() / total_nlst * 100,1)],\n", + " ['> 70',plco[(plco['age']>=71)]['age'].count(), round(plco[(plco['age']>=71)]['age'].count() / total_plco * 100,1), nlst[(nlst['age']>=71)]['age'].count(), round(nlst[(nlst['age']>=71)]['age'].count() / total_nlst * 100,1)],\n", + " ['Missing',plco['age'].isna().sum(), round(plco['age'].isna().sum() / total_plco * 100,1), nlst['age'].isna().sum(), round(nlst['age'].isna().sum() / total_nlst * 100,1)]] \n", + "print(tabulate(table_age))" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------- ----- ------ ----- ------\n", + "Smoking cessation age PLCO PLCO % NLST NLST %\n", + "<= 30 10470 19.0 2 0.0\n", + "30 < ... <= 40 11886 21.5 130 0.3\n", + "40 < ... <= 50 11447 20.8 7025 14.5\n", + "50 < ... <= 60 8649 15.7 14071 29.0\n", + "> 60 1942 3.5 4378 9.0\n", + "Missing 10767 19.5 22989 47.3\n", + "--------------------- ----- ------ ----- ------\n" + ] + } + ], + "source": [ + "table_ssmokea_f = [['Smoking cessation age', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n", + " ['<= 30', plco[plco['ssmokea_f']<31]['ssmokea_f'].count(), round(plco[plco['ssmokea_f']<31]['ssmokea_f'].count() / total_plco * 100,1), nlst[nlst['ssmokea_f']<31]['ssmokea_f'].count(), round(nlst[nlst['ssmokea_f']<31]['ssmokea_f'].count() / total_nlst * 100,1)],\n", + " ['30 < ... 
<= 40',plco[(plco['ssmokea_f']>=31) & (plco['ssmokea_f']<41)]['ssmokea_f'].count(), round(plco[(plco['ssmokea_f']>=31) & (plco['ssmokea_f']<41)]['ssmokea_f'].count()/ total_plco * 100,1), nlst[(nlst['ssmokea_f']>=31) & (nlst['ssmokea_f']<41)]['ssmokea_f'].count(), round(nlst[(nlst['ssmokea_f']>=31) & (nlst['ssmokea_f']<41)]['ssmokea_f'].count() / total_nlst * 100,1)],\n", + " ['40 < ... <= 50',plco[(plco['ssmokea_f']>=41) & (plco['ssmokea_f']<51)]['ssmokea_f'].count(), round(plco[(plco['ssmokea_f']>=41) & (plco['ssmokea_f']<51)]['ssmokea_f'].count() / total_plco * 100,1), nlst[(nlst['ssmokea_f']>=41) & (nlst['ssmokea_f']<51)]['ssmokea_f'].count(),round(nlst[(nlst['ssmokea_f']>=41) & (nlst['ssmokea_f']<51)]['ssmokea_f'].count() / total_nlst * 100,1)],\n", + " ['50 < ... <= 60',plco[(plco['ssmokea_f']>=51) & (plco['ssmokea_f']<61)]['ssmokea_f'].count(), round(plco[(plco['ssmokea_f']>=51) & (plco['ssmokea_f']<61)]['ssmokea_f'].count() / total_plco * 100,1), nlst[(nlst['ssmokea_f']>=51) & (nlst['ssmokea_f']<61)]['ssmokea_f'].count(),round(nlst[(nlst['ssmokea_f']>=51) & (nlst['ssmokea_f']<61)]['ssmokea_f'].count() / total_nlst * 100,1)],\n", + " ['> 60',plco[(plco['ssmokea_f']>=61)]['ssmokea_f'].count(), round(plco[(plco['ssmokea_f']>=61)]['ssmokea_f'].count() / total_plco * 100,1), nlst[(nlst['ssmokea_f']>=61)]['ssmokea_f'].count(), round(nlst[(nlst['ssmokea_f']>=61)]['ssmokea_f'].count() / total_nlst * 100,1)],\n", + " ['Missing',plco['ssmokea_f'].isna().sum(), round(plco['ssmokea_f'].isna().sum() / total_plco * 100,1), nlst['ssmokea_f'].isna().sum(), round(nlst['ssmokea_f'].isna().sum() / total_nlst * 100,1)]] \n", + "print(tabulate(table_ssmokea_f))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------- ----- ------ ----- ------\n", + "Smoking status PLCO PLCO % NLST NLST %\n", + "Active 9965 18.1 22842 47.0\n", + "Former 45196 81.9 25753 53.0\n", 
+ "Missing 0 0.0 0 0.0\n", + "-------------- ----- ------ ----- ------\n" + ] + } + ], + "source": [ + "table_cig_stat = [['Smoking status', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n", + " ['Active', plco[plco['cig_stat']==1]['cig_stat'].count(),round(plco[plco['cig_stat']==1]['cig_stat'].count() / total_plco * 100,1), nlst[nlst['cig_stat']==1]['cig_stat'].count(), round(nlst[nlst['cig_stat']==1]['cig_stat'].count() / total_nlst * 100,1)],\n", + " ['Former', plco[plco['cig_stat']==2]['cig_stat'].count(),round(plco[plco['cig_stat']==2]['cig_stat'].count() / total_plco * 100,1), nlst[nlst['cig_stat']==2]['cig_stat'].count(), round(nlst[nlst['cig_stat']==2]['cig_stat'].count() / total_nlst * 100,1)],\n", + " ['Missing', plco['cig_stat'].isna().sum(), round(plco['cig_stat'].isna().sum()/total_plco*100,1), nlst['cig_stat'].isna().sum(), round(nlst['cig_stat'].isna().sum() / total_nlst*100,1)]]\n", + " \n", + "print(tabulate(table_cig_stat))" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------- ----- ------ ----- ------\n", + "Pack years PLCO PLCO % NLST NLST %\n", + "<= 25 26981 48.9 8 0.0\n", + "25 < ... <= 50 16147 29.3 26746 55.0\n", + "50 < ... <= 100 9448 17.1 19544 40.2\n", + "> 100 1434 2.6 2297 4.7\n", + "Missing 1151 2.1 0 0.0\n", + "--------------- ----- ------ ----- ------\n" + ] + } + ], + "source": [ + "table_pack_years = [['Pack years', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n", + " ['<= 25', plco[plco['pack_years']<26]['pack_years'].count(), round(plco[plco['pack_years']<26]['pack_years'].count() / total_plco * 100,1), nlst[nlst['pack_years']<26]['pack_years'].count(), round(nlst[nlst['pack_years']<26]['pack_years'].count() / total_nlst * 100,1)],\n", + " ['25 < ... 
<= 50',plco[(plco['pack_years']>=26) & (plco['pack_years']<51)]['pack_years'].count(), round(plco[(plco['pack_years']>=26) & (plco['pack_years']<51)]['pack_years'].count() / total_plco * 100,1), nlst[(nlst['pack_years']>=26) & (nlst['pack_years']<51)]['pack_years'].count(),round(nlst[(nlst['pack_years']>=26) & (nlst['pack_years']<51)]['pack_years'].count() / total_nlst * 100,1)],\n", + " ['50 < ... <= 100',plco[(plco['pack_years']>=51) & (plco['pack_years']<101)]['pack_years'].count(), round(plco[(plco['pack_years']>=51) & (plco['pack_years']<101)]['pack_years'].count()/ total_plco * 100,1), nlst[(nlst['pack_years']>=51) & (nlst['pack_years']<101)]['pack_years'].count(), round(nlst[(nlst['pack_years']>=51) & (nlst['pack_years']<101)]['pack_years'].count() / total_nlst * 100,1)],\n", + " ['> 100',plco[(plco['pack_years']>=101)]['pack_years'].count(), round(plco[(plco['pack_years']>=101)]['pack_years'].count() / total_plco * 100,1), nlst[(nlst['pack_years']>=101)]['pack_years'].count(), round(nlst[(nlst['pack_years']>=101)]['pack_years'].count() / total_nlst * 100,1)],\n", + " ['Missing',plco['pack_years'].isna().sum(), round(plco['pack_years'].isna().sum() / total_plco * 100,1), nlst['pack_years'].isna().sum(), round(nlst['pack_years'].isna().sum() / total_nlst * 100,1)]] \n", + "print(tabulate(table_pack_years))" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------------- ----- ------ ----- ------\n", + "Smoking onset age PLCO PLCO % NLST NLST %\n", + "<= 15 10169 18.4 17927 36.9\n", + "15 < ... 
<= 20 33760 61.2 25411 52.3\n", + "> 20 10950 19.9 5256 10.8\n", + "Missing 282 0.5 1 0.0\n", + "----------------- ----- ------ ----- ------\n" + ] + } + ], + "source": [ + "table_smokea_f = [['Smoking onset age', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n", + " ['<= 15', plco[plco['smokea_f']<16]['smokea_f'].count(), round(plco[plco['smokea_f']<16]['smokea_f'].count() / total_plco * 100,1), nlst[nlst['smokea_f']<16]['smokea_f'].count(), round(nlst[nlst['smokea_f']<16]['smokea_f'].count() / total_nlst * 100,1)],\n", + " ['15 < ... <= 20',plco[(plco['smokea_f']>=16) & (plco['smokea_f']<21)]['smokea_f'].count(), round(plco[(plco['smokea_f']>=16) & (plco['smokea_f']<21)]['smokea_f'].count()/ total_plco * 100,1), nlst[(nlst['smokea_f']>=16) & (nlst['smokea_f']<21)]['smokea_f'].count(), round(nlst[(nlst['smokea_f']>=16) & (nlst['smokea_f']<21)]['smokea_f'].count() / total_nlst * 100,1)],\n", + " ['> 20',plco[(plco['smokea_f']>=21)]['smokea_f'].count(), round(plco[(plco['smokea_f']>=21)]['smokea_f'].count() / total_plco * 100,1), nlst[(nlst['smokea_f']>=21)]['smokea_f'].count(), round(nlst[(nlst['smokea_f']>=21)]['smokea_f'].count() / total_nlst * 100,1)],\n", + " ['Missing',plco['smokea_f'].isna().sum(), round(plco['smokea_f'].isna().sum() / total_plco * 100,1), nlst['smokea_f'].isna().sum(), round(nlst['smokea_f'].isna().sum() / total_nlst * 100,1)]] \n", + "print(tabulate(table_smokea_f))" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------- ----- ------ ----- ------\n", + "Smoking years PLCO PLCO % NLST NLST %\n", + "<= 10 8800 16.0 2 0.0\n", + "10 < ... <= 20 11761 21.3 292 0.6\n", + "20 < ... <= 30 11532 20.9 5134 10.6\n", + "30 < ... 
<= 40 13037 23.6 21620 44.5\n", + "> 40 8963 16.2 21547 44.3\n", + "Missing 1068 1.9 0 0.0\n", + "-------------- ----- ------ ----- ------\n" + ] + } + ], + "source": [ + "table_cig_years = [['Smoking years', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n", + " ['<= 10', plco[plco['cig_years']<11]['cig_years'].count(), round(plco[plco['cig_years']<11]['cig_years'].count() / total_plco * 100,1), nlst[nlst['cig_years']<11]['cig_years'].count(), round(nlst[nlst['cig_years']<11]['cig_years'].count() / total_nlst * 100,1)],\n", + " ['10 < ... <= 20',plco[(plco['cig_years']>=11) & (plco['cig_years']<21)]['cig_years'].count(), round(plco[(plco['cig_years']>=11) & (plco['cig_years']<21)]['cig_years'].count()/ total_plco * 100,1), nlst[(nlst['cig_years']>=11) & (nlst['cig_years']<21)]['cig_years'].count(), round(nlst[(nlst['cig_years']>=11) & (nlst['cig_years']<21)]['cig_years'].count() / total_nlst * 100,1)],\n", + " ['20 < ... <= 30',plco[(plco['cig_years']>=21) & (plco['cig_years']<31)]['cig_years'].count(), round(plco[(plco['cig_years']>=21) & (plco['cig_years']<31)]['cig_years'].count() / total_plco * 100,1), nlst[(nlst['cig_years']>=21) & (nlst['cig_years']<31)]['cig_years'].count(),round(nlst[(nlst['cig_years']>=21) & (nlst['cig_years']<31)]['cig_years'].count() / total_nlst * 100,1)],\n", + " ['30 < ... 
<= 40',plco[(plco['cig_years']>=31) & (plco['cig_years']<41)]['cig_years'].count(), round(plco[(plco['cig_years']>=31) & (plco['cig_years']<41)]['cig_years'].count() / total_plco * 100,1), nlst[(nlst['cig_years']>=31) & (nlst['cig_years']<41)]['cig_years'].count(),round(nlst[(nlst['cig_years']>=31) & (nlst['cig_years']<41)]['cig_years'].count() / total_nlst * 100,1)],\n", + " ['> 40',plco[(plco['cig_years']>=41)]['cig_years'].count(), round(plco[(plco['cig_years']>=41)]['cig_years'].count() / total_plco * 100,1), nlst[(nlst['cig_years']>=41)]['cig_years'].count(), round(nlst[(nlst['cig_years']>=41)]['cig_years'].count() / total_nlst * 100,1)],\n", + " ['Missing',plco['cig_years'].isna().sum(), round(plco['cig_years'].isna().sum() / total_plco * 100,1), nlst['cig_years'].isna().sum(), round(nlst['cig_years'].isna().sum() / total_nlst * 100,1)]] \n", + "print(tabulate(table_cig_years))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-------------------------- ----- ------ ----- ------\n", + "Lung cancer family history PLCO PLCO % NLST NLST %\n", + "No 48415 87.8 37302 76.8\n", + "Yes 6323 11.5 10598 21.8\n", + "Missing 423 0.8 695 1.4\n", + "-------------------------- ----- ------ ----- ------\n" + ] + } + ], + "source": [ + "table_lung_fh = [['Lung cancer family history', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n", + " ['No', plco[plco['lung_fh']==0]['lung_fh'].count(),round(plco[plco['lung_fh']==0]['lung_fh'].count() / total_plco * 100,1), nlst[nlst['lung_fh']==0]['lung_fh'].count(), round(nlst[nlst['lung_fh']==0]['lung_fh'].count() / total_nlst * 100,1)],\n", + " ['Yes', plco[plco['lung_fh']==1]['lung_fh'].count(),round(plco[plco['lung_fh']==1]['lung_fh'].count() / total_plco * 100,1), nlst[nlst['lung_fh']==1]['lung_fh'].count(), round(nlst[nlst['lung_fh']==1]['lung_fh'].count() / total_nlst * 100,1)],\n", + " ['Missing', plco['lung_fh'].isna().sum(), 
round(plco['lung_fh'].isna().sum()/total_plco*100,1), nlst['lung_fh'].isna().sum(), round(nlst['lung_fh'].isna().sum() / total_nlst*100,1)]]\n", + "print(tabulate(table_lung_fh))" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "------------------------------------ ----- ------ ----- ------\n", + "Body Mass Index PLCO PLCO % NLST NLST %\n", + "Underweight (... <= 18.4) 295 0.5 347 0.7\n", + "Healthy weight (18.5 <= ... <= 24.9) 17556 31.8 13404 27.6\n", + "Overweight (25 <= ... <= 29.9) 23920 43.4 20894 43.0\n", + "Obesity (... >= 30) 12631 22.9 13696 28.2\n", + "Missing 759 1.4 234 0.5\n", + "------------------------------------ ----- ------ ----- ------\n" + ] + } + ], + "source": [ + "table_bmi = [['Body Mass Index', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n", + " ['Underweight (... <= 18.4)', plco[plco['bmi']<18.5]['bmi'].count(), round(plco[plco['bmi']<18.4]['bmi'].count() / total_plco * 100,1), nlst[nlst['bmi']<18.4]['bmi'].count(), round(nlst[nlst['bmi']<18.4]['bmi'].count() / total_nlst * 100,1)],\n", + " ['Healthy weight (18.5 <= ... <= 24.9)',plco[(plco['bmi']>=18.5) & (plco['bmi']<25)]['bmi'].count(), round(plco[(plco['bmi']>=18.5) & (plco['bmi']<25)]['bmi'].count()/ total_plco * 100,1), nlst[(nlst['bmi']>=18.5) & (nlst['bmi']<25)]['bmi'].count(), round(nlst[(nlst['bmi']>=18.5) & (nlst['bmi']<25)]['bmi'].count() / total_nlst * 100,1)],\n", + " ['Overweight (25 <= ... <= 29.9)',plco[(plco['bmi']>=25) & (plco['bmi']<30)]['bmi'].count(), round(plco[(plco['bmi']>=25) & (plco['bmi']<30)]['bmi'].count() / total_plco * 100,1), nlst[(nlst['bmi']>=25) & (nlst['bmi']<30)]['bmi'].count(),round(nlst[(nlst['bmi']>=25) & (nlst['bmi']<30)]['bmi'].count() / total_nlst * 100,1)],\n", + " ['Obesity (... 
>= 30)',plco[(plco['bmi']>=30)]['bmi'].count(), round(plco[(plco['bmi']>=30)]['bmi'].count() / total_plco * 100,1), nlst[(nlst['bmi']>=30)]['bmi'].count(), round(nlst[(nlst['bmi']>=30)]['bmi'].count() / total_nlst * 100,1)],\n", + " ['Missing',plco['bmi'].isna().sum(), round(plco['bmi'].isna().sum() / total_plco * 100,1), nlst['bmi'].isna().sum(), round(nlst['bmi'].isna().sum() / total_nlst * 100,1)]] \n", + "print(tabulate(table_bmi))" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "----------- ----- ------ ----- ------\n", + "Lung cancer PLCO PLCO % NLST NLST %\n", + "Negative 52409 95.0 47084 96.9\n", + "Positive 2752 5.0 1511 3.1\n", + "Missing 0 0.0 0 0.0\n", + "----------- ----- ------ ----- ------\n" + ] + } + ], + "source": [ + "table_lung_cancer = [['Lung cancer', 'PLCO', 'PLCO %', 'NLST', 'NLST %'],\n", + " ['Negative', plco[plco['lung_cancer']==0]['lung_cancer'].count(),round(plco[plco['lung_cancer']==0]['lung_cancer'].count() / total_plco * 100,1), nlst[nlst['lung_cancer']==0]['lung_cancer'].count(), round(nlst[nlst['lung_cancer']==0]['lung_cancer'].count() / total_nlst * 100,1)],\n", + " ['Positive', plco[plco['lung_cancer']==1]['lung_cancer'].count(),round(plco[plco['lung_cancer']==1]['lung_cancer'].count() / total_plco * 100,1), nlst[nlst['lung_cancer']==1]['lung_cancer'].count(), round(nlst[nlst['lung_cancer']==1]['lung_cancer'].count() / total_nlst * 100,1)],\n", + " ['Missing', plco['lung_cancer'].isna().sum(), round(plco['lung_cancer'].isna().sum()/total_plco*100,1), nlst['lung_cancer'].isna().sum(), round(nlst['lung_cancer'].isna().sum() / total_nlst*100,1)]]\n", + "print(tabulate(table_lung_cancer))" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Saving a txt file\n", + "\n", + "Now we write a text file to concatenate these analyses. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "File edited\n" + ] + } + ], + "source": [ + "with open('./data_analysis.txt', 'w') as f:\n", + " f.write('------------ PRE-PROCESSED DATA ANALYSIS ------------ \\n \\n')\n", + " f.write('We perform data analysis on each features of the PLCO and NLST dataset.\\n')\n", + " f.write('Number of participants: \\n')\n", + " f.write(' - PLCO: ' + str(total_plco) + '\\n')\n", + " f.write(' - NLST: ' + str(total_nlst) + '\\n \\n')\n", + " f.write('--- Feature analysis --- \\n\\n')\n", + " f.write('Age: This feature captures the person’s age. \\n')\n", + " f.write(tabulate(table_age))\n", + " f.write('\\n\\n')\n", + " f.write('Smoking cessation age: This feature describes the age at which the person stopped smoking. \\n')\n", + " f.write(tabulate(table_ssmokea_f))\n", + " f.write('\\n\\n')\n", + " f.write('Smoking status: This feature describes if the person is a current or a former cigarette smoker at the beginning of the study. \\n')\n", + " f.write(tabulate(table_cig_stat))\n", + " f.write('\\n\\n')\n", + " f.write('Pack-years: This feature refers to the number of packs smoked per day multiplied by the number of years during which the person smoked. \\n')\n", + " f.write(tabulate(table_pack_years))\n", + " f.write('\\n\\n')\n", + " f.write('Smoking onset age: This feature indicates the age at which the person started smoking. \\n')\n", + " f.write(tabulate(table_smokea_f))\n", + " f.write('\\n\\n')\n", + " f.write('Years smoked: This feature describes the total number of years during which the person smoked. \\n')\n", + " f.write(tabulate(table_cig_years))\n", + " f.write('\\n\\n')\n", + " f.write('Lung family history: This feature describes if the person has close family (parents, siblings or child) who had lung cancer. 
\\n')\n", + " f.write(tabulate(table_lung_fh))\n", + " f.write('\\n\\n')\n", + " f.write('BMI: This feature describes the person’s body mass index. \\n')\n", + " f.write(tabulate(table_bmi))\n", + " f.write('\\n\\n')\n", + " f.write('Lung cancer: This feature indicates if the person was diagnosed with lung cancer. \\n')\n", + " f.write(tabulate(table_lung_cancer))\n", + " f.write('\\n\\n\\n')\n", + "print(\"File edited\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.9" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +}