--- a
+++ b/experiments/Foresight Metrics MIMIC.ipynb
@@ -0,0 +1,695 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fba6b486",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Used for development\n",
+    "#import sys\n",
+    "#sys.path.insert(0, \"../foresight/\")\n",
+    "#import sys\n",
+    "#sys.path.insert(0, \"../MedCAT/\")\n",
+    "#%load_ext autoreload\n",
+    "#%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbbfb963",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import sys\n",
+    "import os\n",
+    "import pickle\n",
+    "import datasets\n",
+    "import numpy as np\n",
+    "from medcat.cat import CAT\n",
+    "from datetime import datetime"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82fa5c28",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "DATASET = 'test'\n",
+    "DAYS = 1 # Do: 1, 14, 30\n",
+    "MAX_SEQ_LEN = 256\n",
+    "TYPES = ['ALL_TYPES']\n",
+    "\n",
+    "BASE_NAME = 'annotated_february_2022'\n",
+    "DATASET_NAME = 'annotations_stream_phase2_v1'\n",
+    "RUN_NAME = f'{DATASET_NAME}_{DAYS}d_{MAX_SEQ_LEN}_{\"_\".join(TYPES)}'\n",
+    "DATA_PATH = f\"./data/timecat/mimic/{BASE_NAME}/{DATASET_NAME}.pickle\"\n",
+    "DATA_PATH_SPLITS = f\"./data/timecat/mimic/{BASE_NAME}/{DATASET_NAME}_split/\"\n",
+    "TOKENIZER_PATH = f\"./data/timecat/models/gpt/tokenizer_{RUN_NAME}.pickle\"\n",
+    "ALMOST_PREPARED_DATASET_SPLIT_PATH = f\"./data/timecat/mimic/{BASE_NAME}/{RUN_NAME}_almost_prepared_split/\"\n",
+    "PREPARED_DATASET_SPLIT_PATH = f\"./data/timecat/mimic/{BASE_NAME}/{RUN_NAME}_prepared_split/\"\n",
+    "JUST_BEFORE_ENCODING_DATASET_SPLIT_PATH = f\"./data/timecat/mimic/{BASE_NAME}/{RUN_NAME}_just_before_encoding/\"\n",
+    "CAT_PATH = \"./data/models/modelpacks/mc_modelpack_phase2_snomed_190k_february_2022.zip\"\n",
+    "PT_DOB_PATH = \"./data/mimic/pt2dob_datetime.pickle\"\n",
+    "PT_DOD_PATH = \"./data/mimic/pt2dod_timestamp.pickle\"\n",
+    "PT_SEX_PATH = \"./data/mimic/pt2sex.pickle\"\n",
+    "PT_LNS_PATH = f\"./data/timecat/mimic/{BASE_NAME}/lns_{DATASET_NAME}.pickle\"\n",
+    "PT_CNTS_PATH = f\"./data/timecat/mimic/{BASE_NAME}/cnts_{DATASET_NAME}.pickle\"\n",
+    "PT_ETHNICITY_PATH = \"./data/mimic/pt2ethnicity.pickle\"\n",
+    "TOKEN_TYPES_PATH = f'./data/timecat/mimic/{BASE_NAME}/types_{DATASET_NAME}.pickle'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "86a91f2f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds_info = open(\"dataset-metrics-\" + DATASET + '-' + RUN_NAME + '.txt', 'w')\n",
+    "def fprint(*texts):\n",
+    "    for text in texts:\n",
+    "        print(text)\n",
+    "        ds_info.write(str(text) + \"\\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e48922b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from foresight.metrics.next_concept_prediction import precision, metrics_data2df, ComputePrecisionHF\n",
+    "from foresight.tokenizers.simple_map_tokenizer import SimpleMapTokenizer\n",
+    "TOKENIZER_PATH = f\"/home/wish/data/timecat/models/gpt/tokenizer_{RUN_NAME}.pickle\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b17b753",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = [('Concept Type', 'Time (in days)', 'Top-K', 'Overall (MIMIC)', 'New (MIMIC)', 'Old (MIMIC)',\n",
+    "         'Recall All', 'Recall New', 'Recall Old')]\n",
+    "tmap = {'T-11': 'Disorders', 'T-55': 'Substances', 'T-18': 'Findings', 'all': \"All Concepts\", 'T-39': 'Procedures'}\n",
+    "for name in os.listdir(\"./metrics/\"):\n",
+    "    if name.startswith(\"start-0\"):\n",
+    "        m = pickle.load(open(\"./metrics/\" + name, 'rb'))\n",
+    "        p = name.split(\"_\")\n",
+    "        topk = p[1].split(\"-\")[1]\n",
+    "        time = int(p[3].split(\"-\")[1])\n",
+    "        time = int(time)\n",
+    "        types = p[4].split(\".\")[0].split(\"types-\")[1]\n",
+    "        types = tmap[types]\n",
+    "        data.append((types, time, topk, \n",
+    "                     \"{:.2f}\".format(m['precision']['all']), \n",
+    "                     \"{:.2f}\".format(m['precision']['new']), \n",
+    "                     \"{:.2f}\".format(m['precision']['old']),\n",
+    "                     \"{:.2f}\".format(m['recall']['all']),\n",
+    "                     \"{:.2f}\".format(m['recall']['new']),\n",
+    "                     \"{:.2f}\".format(m['recall']['old'])))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5668ed16",
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(data[1:], columns=data[0])\n",
+    "df['Top-K'] = [int(x) for x in df['Top-K'].values]\n",
+    "df = df.sort_values(by=['Concept Type', 'Time (in days)', 'Top-K'])\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6af1d523",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.to_csv(\"./summary.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94065191",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tokenizer = SimpleMapTokenizer.load(TOKENIZER_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37d6d42c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For the standard model get top 20 best performing concepts\n",
+    "m = pickle.load(open('./start-0_topk-1_time_range-30_types-all_types.pickle', 'rb'))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "328ab29b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new = metrics_data2df(m, tkn2name=tokenizer.tkn2name, temporality='new')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6cda4510",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "df_new.head(n=20).to_csv(\"./top_20_cuis_new.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f2fa1496",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "df_new.head(n=20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "353b719e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new[df_new.negatives>100].tail(n=20).to_csv('bottom_20_cuis_with_min_100_negatives_new.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "46671f82",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "df_new[df_new.negatives>100].tail(n=20)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e9488a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_old = metrics_data2df(m, tkn2name=tokenizer.tkn2name, temporality='old')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "602b9d17",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_old.head(n=20).to_csv(\"./top_20_cuis_old.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4065a3b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_old[df_old.negatives>100].tail(n=20).to_csv('bottom_20_cuis_with_min_100_negatives_old.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "33e16c89",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_new[df_new.positives > 10]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "125dfbd6",
+   "metadata": {},
+   "source": [
+    "# Dataset Metrics"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5d6036ff",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "cat = CAT.load_model_pack(CAT_PATH, meta_cat_config_dict={'general': {'device': 'cpu'}})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a84ddda2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset = datasets.load_from_disk(JUST_BEFORE_ENCODING_DATASET_SPLIT_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "262b0aec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "26b62828",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "id2inds = {}\n",
+    "for ind, row in enumerate(dataset[DATASET]):\n",
+    "    if row['patient_id'] in id2inds:\n",
+    "        id2inds[row['patient_id']].append(ind)\n",
+    "    else:\n",
+    "        id2inds[row['patient_id']] = [ind]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7c4509e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = dataset[DATASET].to_dict()\n",
+    "for id in id2inds:\n",
+    "    inds = id2inds[id]\n",
+    "    if len(inds) > 1:\n",
+    "        for ind in inds[1:]:\n",
+    "            data['stream'][inds[0]].extend(data['stream'][ind])\n",
+    "            data['token_type'][inds[0]].extend(data['token_type'][ind])\n",
+    "            data['time'][inds[0]].extend(data['time'][ind])\n",
+    "            data['patient_id'][ind] = \"SKIP\"\n",
+    "dataset_combined = datasets.Dataset.from_dict(data)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb192cce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "timeline_lens = []\n",
+    "timeline_len_years = []\n",
+    "timeline_len_by_sex = {'Female': [], 'Male': [], 'Unknown': []}\n",
+    "timeline_len_by_sex_y = {'Female': [], 'Male': [], 'Unknown': []}\n",
+    "timeline_len_by_eth = {}\n",
+    "timeline_len_by_eth_y = {}\n",
+    "timeline_len_by_age = {'0-18': [], '18-30': [], '30-41': [], '41-50': [], '51-64': [], '64+': []}\n",
+    "timeline_len_by_age_y = {'0-18': [], '18-30': [], '30-41': [], '41-50': [], '51-64': [], '64+': []}\n",
+    "len_per_type = {}\n",
+    "sex = {'Female': 0, 'Male': 0, 'Unknown': 0}\n",
+    "age_groups = {'0-18': [], '18-30': [], '30-41': [], '41-50': [], '51-64': [], '64+': []}\n",
+    "ethnicity = {}\n",
+    "all_types = set([x for x in tokenizer.token_type2tokens.keys() if x.startswith('T-')])\n",
+    "\n",
+    "for e in dataset_combined:\n",
+    "    if e['patient_id'] == 'SKIP':\n",
+    "        continue\n",
+    "    \n",
+    "    t_len = len([x for x in e['token_type'] if x.startswith(\"T-\")])\n",
+    "    timeline_lens.append(t_len)\n",
+    "\n",
+    "    # Timeline in years\n",
+    "    l_years = (datetime.fromtimestamp(e['time'][-1]) - datetime.fromtimestamp(e['time'][0])).days / 365\n",
+    "    if l_years < 0:\n",
+    "        l_years = 0\n",
+    "    timeline_len_years.append(l_years)\n",
+    "    \n",
+    "    # Years\n",
+    "    inds = [i for i, v in enumerate(e['token_type']) if v == 'age']   \n",
+    "    once = False\n",
+    "    old_age_group = -1\n",
+    "    print(e['patient_id'], inds)\n",
+    "    for ind in inds:\n",
+    "        y = int(e['stream'][ind])\n",
+    "        # Use the last ind to determine pts current age\n",
+    "        if ind == inds[-1]:\n",
+    "            once = True\n",
+    "            \n",
+    "        if y <= 18:\n",
+    "            if old_age_group != '0-18':\n",
+    "                age_groups['0-18'].append(y)\n",
+    "                old_age_group = '0-18'\n",
+    "            if once:\n",
+    "                timeline_len_by_age['0-18'].append(t_len)\n",
+    "                timeline_len_by_age_y['0-18'].append(l_years)\n",
+    "        elif y <= 30:\n",
+    "            if old_age_group != '18-30':\n",
+    "                age_groups['18-30'].append(y)\n",
+    "                old_age_group = '18-30'\n",
+    "            if once:\n",
+    "                timeline_len_by_age['18-30'].append(t_len)\n",
+    "                timeline_len_by_age_y['18-30'].append(l_years)\n",
+    "\n",
+    "        elif y <= 41:\n",
+    "            if old_age_group != '30-41':\n",
+    "                age_groups['30-41'].append(y)\n",
+    "                old_age_group = '30-41'\n",
+    "            if once:\n",
+    "                timeline_len_by_age['30-41'].append(t_len)\n",
+    "                timeline_len_by_age_y['30-41'].append(l_years)\n",
+    "        elif y <= 50:\n",
+    "            if old_age_group != '41-50':\n",
+    "                age_groups['41-50'].append(y)\n",
+    "                old_age_group = '41-50'\n",
+    "            if once:\n",
+    "                timeline_len_by_age['41-50'].append(t_len)\n",
+    "                timeline_len_by_age_y['41-50'].append(l_years)\n",
+    "        elif y <= 64:\n",
+    "            if old_age_group != '51-64':\n",
+    "                age_groups['51-64'].append(y)\n",
+    "                old_age_group = '51-64'\n",
+    "            if once:\n",
+    "                timeline_len_by_age['51-64'].append(t_len)\n",
+    "                timeline_len_by_age_y['51-64'].append(l_years)\n",
+    "        else:\n",
+    "            if old_age_group != '64+':\n",
+    "                age_groups['64+'].append(y)\n",
+    "                old_age_group = '64+'\n",
+    "            if once:\n",
+    "                timeline_len_by_age['64+'].append(t_len)\n",
+    "                timeline_len_by_age_y['64+'].append(l_years)\n",
+    "        once = False\n",
+    "\n",
+    "    # Sex\n",
+    "    if 'sex' in e['token_type']:\n",
+    "        ind = e['token_type'].index('sex')\n",
+    "        val = e['stream'][ind]\n",
+    "        if val == 'Female' or val == 'F':\n",
+    "            sex['Female'] += 1\n",
+    "            timeline_len_by_sex['Female'].append(t_len)\n",
+    "            timeline_len_by_sex_y['Female'].append(l_years)\n",
+    "        elif val == 'Male' or val == 'M':\n",
+    "            sex['Male'] += 1\n",
+    "            timeline_len_by_sex['Male'].append(t_len)\n",
+    "            timeline_len_by_sex_y['Male'].append(l_years)\n",
+    "        else:\n",
+    "            sex['Unknown'] += 1\n",
+    "            timeline_len_by_sex['Unknown'].append(t_len)\n",
+    "            timeline_len_by_sex_y['Unknown'].append(l_years)\n",
+    "    else:\n",
+    "        sex['Unknown'] += 1\n",
+    "        timeline_len_by_sex['Unknown'].append(t_len)\n",
+    "        timeline_len_by_sex_y['Unknown'].append(l_years)\n",
+    "        \n",
+    "    # Ethnicity\n",
+    "    if 'ethnicity' in e['token_type']:\n",
+    "        ind = e['token_type'].index('ethnicity')\n",
+    "        val = e['stream'][ind]\n",
+    "        if val in ethnicity:\n",
+    "            ethnicity[val] += 1\n",
+    "            timeline_len_by_eth[val].append(t_len)\n",
+    "            timeline_len_by_eth_y[val].append(l_years)\n",
+    "        else:\n",
+    "            ethnicity[val] = 1\n",
+    "            timeline_len_by_eth[val] = [t_len]\n",
+    "            timeline_len_by_eth_y[val] = [l_years]\n",
+    "    else:\n",
+    "        if 'Unknown' in ethnicity:\n",
+    "            ethnicity['Unknown'] += 1\n",
+    "            timeline_len_by_eth['Unknown'].append(t_len)\n",
+    "            timeline_len_by_eth_y['Unknown'].append(l_years)\n",
+    "        else:\n",
+    "            ethnicity['Unknown'] = 1\n",
+    "            timeline_len_by_eth['Unknown'] = [t_len]\n",
+    "            timeline_len_by_eth_y['Unknown'] = [l_years]\n",
+    "    \n",
+    "    # Concepts per CUI\n",
+    "    #vals = [v for v in e['token_type'] if v.startswith('T-')]\n",
+    "    for val in all_types:\n",
+    "        title = cat.cdb.addl_info['type_id2name'][val].title()\n",
+    "        if title in len_per_type:\n",
+    "            len_per_type[title].append(len([x for x in e['token_type'] if x == val]))\n",
+    "        else:\n",
+    "            len_per_type[title] = [len([x for x in e['token_type'] if x == val])]"
+   ]
+  },
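+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0f3a9c21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check (an added sketch, not part of the original analysis, assuming\n",
+    "# the loop above ran to completion): every non-SKIP patient is counted exactly once\n",
+    "# in the sex and ethnicity breakdowns and contributes one entry to timeline_lens\n",
+    "assert sum(sex.values()) == len(timeline_lens) == sum(len(v) for v in timeline_len_by_eth.values())"
+   ]
+  },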
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de1cf1aa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Mean number of concepts of certain type per pt\n",
+    "fprint(\"Mean number of concepts of certain type per pt\")\n",
+    "for t in len_per_type:\n",
+    "    fprint(\"{:30} : {}\".format(t, np.mean(len_per_type[t])))\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29ae3172",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Mean timeline length by age group\n",
+    "fprint(\"Mean timeline length by age group\")\n",
+    "fprint(timeline_len_by_age.keys(), '')\n",
+    "for age in timeline_len_by_age:\n",
+    "    fprint(\"{:.0f} ({:.1f})\".format(np.mean(timeline_len_by_age[age]), np.mean(timeline_len_by_age_y[age])))\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dd67782c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "new_timeline_len_by_eth = {'White': [], 'Black': [], 'Other': [], \n",
+    "                 'Asian': [], 'Unknown': [], 'Mixed': []}\n",
+    "new_timeline_len_by_eth_y = {'White': [], 'Black': [], 'Other': [], \n",
+    "                 'Asian': [], 'Unknown': [], 'Mixed': []}\n",
+    "\n",
+    "for eth in timeline_len_by_eth:\n",
+    "    if 'ASIAN' in eth:\n",
+    "        new_timeline_len_by_eth['Asian'].extend(timeline_len_by_eth[eth])\n",
+    "        new_timeline_len_by_eth_y['Asian'].extend(timeline_len_by_eth_y[eth])\n",
+    "    elif 'BLACK' in eth:\n",
+    "        new_timeline_len_by_eth['Black'].extend(timeline_len_by_eth[eth])\n",
+    "        new_timeline_len_by_eth_y['Black'].extend(timeline_len_by_eth_y[eth])\n",
+    "    elif 'WHITE' in eth:\n",
+    "        new_timeline_len_by_eth['White'].extend(timeline_len_by_eth[eth])\n",
+    "        new_timeline_len_by_eth_y['White'].extend(timeline_len_by_eth_y[eth])\n",
+    "    elif 'UNKNOWN' in eth or 'PATIENT DECLINED TO ANSWER' in eth or 'UNABLE TO OBTAIN' in eth:\n",
+    "        new_timeline_len_by_eth['Unknown'].extend(timeline_len_by_eth[eth])\n",
+    "        new_timeline_len_by_eth_y['Unknown'].extend(timeline_len_by_eth_y[eth])\n",
+    "    elif 'MULTI' in eth:\n",
+    "        new_timeline_len_by_eth['Mixed'].extend(timeline_len_by_eth[eth])\n",
+    "        new_timeline_len_by_eth_y['Mixed'].extend(timeline_len_by_eth_y[eth])\n",
+    "    else:\n",
+    "        new_timeline_len_by_eth['Other'].extend(timeline_len_by_eth[eth])\n",
+    "        new_timeline_len_by_eth_y['Other'].extend(timeline_len_by_eth_y[eth])\n",
+    "\n",
+    "fprint(\"Mean timeline length by ethnicity\")\n",
+    "for eth in new_timeline_len_by_eth:\n",
+    "    fprint(\"{:10} : {:.0f} ({:.1f})\".format(eth, np.mean(new_timeline_len_by_eth[eth]), \n",
+    "                                            np.mean(new_timeline_len_by_eth_y[eth])))\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6e999d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fprint(\"Mean timeline length by sex\")\n",
+    "fprint(timeline_len_by_sex.keys(), '')\n",
+    "for s in timeline_len_by_sex:\n",
+    "    fprint(\"{:.0f} ({:.1f})\".format(np.mean(timeline_len_by_sex[s]), np.mean(timeline_len_by_sex_y[s])))\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5a1f4fb3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fprint(\"Mean timeline len: \", np.mean(timeline_lens))\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4aa01a65",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Number of pts by ethnicity\n",
+    "#fprint(\"Ethnicity: \", ethnicity)\n",
+    "new_ethnicity = {'White': 0, 'Black': 0, 'Other': 0, \n",
+    "                 'Asian': 0, 'Unknown': 0, 'Mixed': 0}\n",
+    "\n",
+    "for eth in ethnicity:\n",
+    "    if 'ASIAN' in eth:\n",
+    "        new_ethnicity['Asian'] += ethnicity[eth]\n",
+    "    elif 'BLACK' in eth:\n",
+    "        new_ethnicity['Black'] += ethnicity[eth]\n",
+    "    elif 'WHITE' in eth:\n",
+    "        new_ethnicity['White'] += ethnicity[eth]\n",
+    "    elif 'UNKNOWN' in eth or 'PATIENT DECLINED TO ANSWER' in eth or 'UNABLE TO OBTAIN' in eth:\n",
+    "        new_ethnicity['Unknown'] += ethnicity[eth]\n",
+    "    elif 'MULTI' in eth:\n",
+    "        new_ethnicity['Mixed'] += ethnicity[eth]\n",
+    "    else:\n",
+    "        new_ethnicity['Other'] += ethnicity[eth]\n",
+    "fprint(new_ethnicity)\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c16fe4f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Number of pts by sex\n",
+    "fprint(sex)\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f0aae15a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fprint(\"Total pts for sex: \", sum(sex.values()))\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d4401a1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3b19d621",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Number of pts by age (note that we are multi counting, if for one pt we have age 27, 28 and 35 that will be three counts)\n",
+    "t_cnt = 0\n",
+    "fprint(\"Age group, mean age for group, number of patients in this group (with multi counting)\")\n",
+    "for g in age_groups:\n",
+    "    fprint('{} - {:.3f} - {}'.format(g, np.mean(age_groups[g]), len(age_groups[g])))\n",
+    "    t_cnt += len(age_groups[g])\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f14d0451",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Overall timeline mean length in years \n",
+    "fprint('Timeline len in years: ', np.mean(timeline_len_years))\n",
+    "fprint('\\n')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35f6bf1c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ds_info.close()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}