Switch to side-by-side view

--- a
+++ b/notebooks/USPSTF_recommendations.ipynb
@@ -0,0 +1,250 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# USPSTF recommendations notebook\n",
+    "\n",
+    "P. Benveniste $^1$, J. Alberge $^1$\n",
+    "\n",
+    "$^1$ Ecole Normale Supérieure Paris-Saclay\n",
+    "\n",
+    "In this notebook, we evaluate the USPSTF screening recommendations on the PLCO and NLST datasets by comparing eligibility under the recommendation with the recorded lung cancer outcome. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Import the libraries\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "from tabulate import tabulate"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We now load both preprocessed datasets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(55161, 10)\n",
+      "(48595, 10)\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Locations of the two preprocessed dataset files\n",
+    "plco_file = './preprocessed_plco.csv'\n",
+    "nlst_file = './preprocessed_nlst.csv'\n",
+    "\n",
+    "# Read each dataset into its own DataFrame\n",
+    "plco = pd.read_csv(plco_file)\n",
+    "nlst = pd.read_csv(nlst_file)\n",
+    "\n",
+    "# Record the row counts, then show (rows, columns) for PLCO followed by NLST\n",
+    "total_plco, total_nlst = plco.shape[0], nlst.shape[0]\n",
+    "print(plco.shape)\n",
+    "print(nlst.shape)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
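+    "### USPSTF recommendation criteria\n",
+    "\n",
+    "We now apply the USPSTF recommendation criteria to PLCO and NLST. As implemented below, a participant is selected when `age` is between 50 and 80, `pack_years` is at least 20, and either `cig_stat == 1` or `age - ssmokea_f <= 15`. The selection is then compared with the `lung_cancer` outcome to obtain TP, FN, TN, FP, precision and recall."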
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pre-processed PLCO size: 55161\n",
+      "Pre-processed PLCO with lung cancer: 2752\n",
+      "Patients from PLCO who fit into US recommendation: 22609\n",
+      "Patients from PLCO who fit into US recommendation with lung cancer: 2105\n",
+      "------- USPSTF RECOMMENDATION ON PLCO --------\n",
+      "TP :  2105\n",
+      "FN :  647\n",
+      "TN :  31905\n",
+      "FP :  20504\n",
+      "Precision :  0.093\n",
+      "Recall :  0.765\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Outcome masks over the full PLCO dataset\n",
+    "plco_cancer = plco[\"lung_cancer\"] == 1\n",
+    "plco_no_cancer = plco[\"lung_cancer\"] == 0\n",
+    "\n",
+    "print(\"Pre-processed PLCO size:\", len(plco))\n",
+    "print(\"Pre-processed PLCO with lung cancer:\", int(plco_cancer.sum()))\n",
+    "\n",
+    "# Selection rule as coded here: age within [50, 80], pack_years >= 20,\n",
+    "# and either cig_stat == 1 or age - ssmokea_f <= 15\n",
+    "plco_eligible = (\n",
+    "    (plco[\"age\"] >= 50)\n",
+    "    & (plco[\"age\"] <= 80)\n",
+    "    & (plco[\"pack_years\"] >= 20)\n",
+    "    & ((plco[\"cig_stat\"] == 1) | (plco[\"age\"] - plco[\"ssmokea_f\"] <= 15))\n",
+    ")\n",
+    "plco_criteria = plco[plco_eligible]\n",
+    "\n",
+    "# Being selected is the prediction; lung_cancer is the reference label\n",
+    "TP_plco = int((plco_eligible & plco_cancer).sum())\n",
+    "FP_plco = int((plco_eligible & plco_no_cancer).sum())\n",
+    "FN_plco = int(plco_cancer.sum()) - TP_plco\n",
+    "TN_plco = int(plco_no_cancer.sum()) - FP_plco\n",
+    "\n",
+    "print(\"Patients from PLCO who fit into US recommendation:\", len(plco_criteria))\n",
+    "print(\"Patients from PLCO who fit into US recommendation with lung cancer:\", TP_plco)\n",
+    "\n",
+    "print(\"------- USPSTF RECOMMENDATION ON PLCO --------\")\n",
+    "for label, value in ((\"TP : \", TP_plco), (\"FN : \", FN_plco), (\"TN : \", TN_plco), (\"FP : \", FP_plco)):\n",
+    "    print(label, value)\n",
+    "print(\"Precision : \", round(TP_plco / (TP_plco + FP_plco), 3))\n",
+    "print(\"Recall : \", round(TP_plco / (TP_plco + FN_plco), 3))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Pre-processed NLST size: 48595\n",
+      "Pre-processed NLST with cancer: 1511\n",
+      "Patients from NLST who fit into US recommendation: 48034\n",
+      "Patients from NLST who fit into US recommendation with cancer: 1495\n",
+      "------- USPSTF RECOMMENDATION ON NLST --------\n",
+      "TP :  1495\n",
+      "FN :  16\n",
+      "TN :  545\n",
+      "FP :  46539\n",
+      "Precision :  0.031\n",
+      "Recall :  0.989\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Outcome masks over the full NLST dataset\n",
+    "nlst_cancer = nlst[\"lung_cancer\"] == 1\n",
+    "nlst_no_cancer = nlst[\"lung_cancer\"] == 0\n",
+    "\n",
+    "print(\"Pre-processed NLST size:\", len(nlst))\n",
+    "print(\"Pre-processed NLST with cancer:\", int(nlst_cancer.sum()))\n",
+    "\n",
+    "# Selection rule as coded here: age within [50, 80], pack_years >= 20,\n",
+    "# and either cig_stat == 1 or age - ssmokea_f <= 15\n",
+    "nlst_eligible = (\n",
+    "    (nlst[\"age\"] >= 50)\n",
+    "    & (nlst[\"age\"] <= 80)\n",
+    "    & (nlst[\"pack_years\"] >= 20)\n",
+    "    & ((nlst[\"cig_stat\"] == 1) | (nlst[\"age\"] - nlst[\"ssmokea_f\"] <= 15))\n",
+    ")\n",
+    "nlst_criteria = nlst[nlst_eligible]\n",
+    "\n",
+    "# Being selected is the prediction; lung_cancer is the reference label\n",
+    "TP_nlst = int((nlst_eligible & nlst_cancer).sum())\n",
+    "FP_nlst = int((nlst_eligible & nlst_no_cancer).sum())\n",
+    "FN_nlst = int(nlst_cancer.sum()) - TP_nlst\n",
+    "TN_nlst = int(nlst_no_cancer.sum()) - FP_nlst\n",
+    "\n",
+    "print(\"Patients from NLST who fit into US recommendation:\", len(nlst_criteria))\n",
+    "print(\"Patients from NLST who fit into US recommendation with cancer:\", TP_nlst)\n",
+    "\n",
+    "print(\"------- USPSTF RECOMMENDATION ON NLST --------\")\n",
+    "for label, value in ((\"TP : \", TP_nlst), (\"FN : \", FN_nlst), (\"TN : \", TN_nlst), (\"FP : \", FP_nlst)):\n",
+    "    print(label, value)\n",
+    "print(\"Precision : \", round(TP_nlst / (TP_nlst + FP_nlst), 3))\n",
+    "print(\"Recall : \", round(TP_nlst / (TP_nlst + FN_nlst), 3))"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Saving a txt file\n",
+    "\n",
+    "We now write the results of both analyses to a single text file. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File edited\n"
+     ]
+    }
+   ],
+   "source": [
+    "def _uspstf_report(name, cohort, eligible, tp, fn, tn, fp):\n",
+    "    \"\"\"Return the text summary of the USPSTF comparison for one dataset.\n",
+    "\n",
+    "    name is the label inserted in the headings, cohort is the full DataFrame,\n",
+    "    eligible is the subset selected by the recommendation, and tp/fn/tn/fp\n",
+    "    are the confusion counts computed in the cells above.\n",
+    "    \"\"\"\n",
+    "    n_cancer = len(cohort[cohort[\"lung_cancer\"] == 1])\n",
+    "    n_eligible_cancer = len(eligible[eligible[\"lung_cancer\"] == 1])\n",
+    "    return \"\".join([\n",
+    "        f\"------------ COMPARISON WITH USPSTF ON {name}------------ \\n \\n\",\n",
+    "        f\"Pre-processed {name} size: {len(cohort)}\\n\",\n",
+    "        f\"Pre-processed {name} with lung cancer: {n_cancer}\\n\",\n",
+    "        f\"Patients from {name} who fit into US recommendation: {len(eligible)}\\n\",\n",
+    "        f\"Patients from {name} who fit into US recommendation with lung cancer: {n_eligible_cancer}\\n\\n\",\n",
+    "        f\"------- USPSTF RECOMMENDATION ON {name} -------- \\n\",\n",
+    "        f\"TP : {tp}\\n\",\n",
+    "        f\"FN : {fn}\\n\",\n",
+    "        f\"TN : {tn}\\n\",\n",
+    "        f\"FP : {fp}\\n\",\n",
+    "        f\"Precision : {round(tp / (tp + fp), 3)}\\n\",\n",
+    "        f\"Recall : {round(tp / (tp + fn), 3)}\\n\\n\\n\",\n",
+    "    ])\n",
+    "\n",
+    "\n",
+    "# Both summaries go into the same text file, PLCO first and NLST second\n",
+    "with open('./USPSTF_recommendations.txt', 'w') as f:\n",
+    "    f.write(_uspstf_report(\"PLCO\", plco, plco_criteria, TP_plco, FN_plco, TN_plco, FP_plco))\n",
+    "    f.write(_uspstf_report(\"NLST\", nlst, nlst_criteria, TP_nlst, FN_nlst, TN_nlst, FP_nlst))\n",
+    "print(\"File edited\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.9"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}