696 lines (695 with data), 22.4 kB
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "fba6b486",
"metadata": {},
"outputs": [],
"source": [
"# Used for development\n",
"#import sys\n",
"#sys.path.insert(0, \"../foresight/\")\n",
"#import sys\n",
"#sys.path.insert(0, \"../MedCAT/\")\n",
"#%load_ext autoreload\n",
"#%autoreload 2"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbbfb963",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import sys\n",
"import os\n",
"import pickle\n",
"import datasets\n",
"import numpy as np\n",
"from medcat.cat import CAT\n",
"from datetime import datetime"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "82fa5c28",
"metadata": {},
"outputs": [],
"source": [
"# Run selection\n",
"DATASET = 'test'  # key used to index the loaded dataset (dataset[DATASET])\n",
"DAYS = 1  # Do: 1, 14, 30\n",
"MAX_SEQ_LEN = 256\n",
"TYPES = ['ALL_TYPES']\n",
"\n",
"BASE_NAME = 'annotated_february_2022'\n",
"DATASET_NAME = 'annotations_stream_phase2_v1'\n",
"RUN_NAME = '_'.join([DATASET_NAME, f'{DAYS}d', str(MAX_SEQ_LEN), '_'.join(TYPES)])\n",
"\n",
"# Directory prefixes shared by the paths below\n",
"_MIMIC_DIR = \"./data/mimic\"\n",
"_TIMECAT_DIR = \"./data/timecat\"\n",
"_TIMECAT_MIMIC_DIR = f\"{_TIMECAT_DIR}/mimic/{BASE_NAME}\"\n",
"\n",
"# Dataset and its derived splits\n",
"DATA_PATH = f\"{_TIMECAT_MIMIC_DIR}/{DATASET_NAME}.pickle\"\n",
"DATA_PATH_SPLITS = f\"{_TIMECAT_MIMIC_DIR}/{DATASET_NAME}_split/\"\n",
"TOKENIZER_PATH = f\"{_TIMECAT_DIR}/models/gpt/tokenizer_{RUN_NAME}.pickle\"\n",
"ALMOST_PREPARED_DATASET_SPLIT_PATH = f\"{_TIMECAT_MIMIC_DIR}/{RUN_NAME}_almost_prepared_split/\"\n",
"PREPARED_DATASET_SPLIT_PATH = f\"{_TIMECAT_MIMIC_DIR}/{RUN_NAME}_prepared_split/\"\n",
"JUST_BEFORE_ENCODING_DATASET_SPLIT_PATH = f\"{_TIMECAT_MIMIC_DIR}/{RUN_NAME}_just_before_encoding/\"\n",
"\n",
"# MedCAT model pack\n",
"CAT_PATH = \"./data/models/modelpacks/mc_modelpack_phase2_snomed_190k_february_2022.zip\"\n",
"\n",
"# Per-patient lookup tables\n",
"PT_DOB_PATH = f\"{_MIMIC_DIR}/pt2dob_datetime.pickle\"\n",
"PT_DOD_PATH = f\"{_MIMIC_DIR}/pt2dod_timestamp.pickle\"\n",
"PT_SEX_PATH = f\"{_MIMIC_DIR}/pt2sex.pickle\"\n",
"PT_LNS_PATH = f\"{_TIMECAT_MIMIC_DIR}/lns_{DATASET_NAME}.pickle\"\n",
"PT_CNTS_PATH = f\"{_TIMECAT_MIMIC_DIR}/cnts_{DATASET_NAME}.pickle\"\n",
"PT_ETHNICITY_PATH = f\"{_MIMIC_DIR}/pt2ethnicity.pickle\"\n",
"TOKEN_TYPES_PATH = f\"{_TIMECAT_MIMIC_DIR}/types_{DATASET_NAME}.pickle\""
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "86a91f2f",
"metadata": {},
"outputs": [],
"source": [
"# Re-running this cell must not leak the report handle opened by a previous run.\n",
"if 'ds_info' in globals() and not ds_info.closed:\n",
"    ds_info.close()\n",
"ds_info = open(\"dataset-metrics-\" + DATASET + '-' + RUN_NAME + '.txt', 'w')\n",
"\n",
"\n",
"def fprint(*texts):\n",
"    \"\"\"Print every argument on its own line and mirror it into the report file.\n",
"\n",
"    Each argument is converted with ``str()`` and written to ``ds_info`` followed\n",
"    by a newline. The file is flushed after every call so the report on disk is\n",
"    complete even if a later cell fails before ``ds_info.close()`` is reached.\n",
"    \"\"\"\n",
"    for text in texts:\n",
"        print(text)\n",
"        ds_info.write(str(text) + \"\\n\")\n",
"    ds_info.flush()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7e48922b",
"metadata": {},
"outputs": [],
"source": [
"from foresight.metrics.next_concept_prediction import precision, metrics_data2df, ComputePrecisionHF\n",
"from foresight.tokenizers.simple_map_tokenizer import SimpleMapTokenizer\n",
"\n",
"# Machine-specific tokenizer location. It replaces the TOKENIZER_PATH from the\n",
"# configuration cell only when the file is actually present, so the notebook\n",
"# also runs on machines that do not have this absolute path.\n",
"_LOCAL_TOKENIZER_PATH = f\"/home/wish/data/timecat/models/gpt/tokenizer_{RUN_NAME}.pickle\"\n",
"if os.path.exists(_LOCAL_TOKENIZER_PATH):\n",
"    TOKENIZER_PATH = _LOCAL_TOKENIZER_PATH"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4b17b753",
"metadata": {},
"outputs": [],
"source": [
"# Header row; one row per metrics file is appended below.\n",
"data = [('Concept Type', 'Time (in days)', 'Top-K', 'Overall (MIMIC)', 'New (MIMIC)', 'Old (MIMIC)',\n",
"         'Recall All', 'Recall New', 'Recall Old')]\n",
"tmap = {'T-11': 'Disorders', 'T-55': 'Substances', 'T-18': 'Findings', 'all': \"All Concepts\", 'T-39': 'Procedures'}\n",
"\n",
"# File names are parsed positionally, e.g. start-0_topk-1_time_range-30_types-all_types.pickle\n",
"# sorted() makes the row order independent of the file system's listing order.\n",
"for name in sorted(os.listdir(\"./metrics/\")):\n",
"    if not name.startswith(\"start-0\"):\n",
"        continue\n",
"    # NOTE: pickle.load can execute arbitrary code; only load metrics files from a trusted source.\n",
"    # The context manager makes sure the file handle is closed.\n",
"    with open(\"./metrics/\" + name, 'rb') as metrics_file:\n",
"        m = pickle.load(metrics_file)\n",
"    p = name.split(\"_\")\n",
"    topk = p[1].split(\"-\")[1]\n",
"    days = int(p[3].split(\"-\")[1])\n",
"    types = tmap[p[4].split(\".\")[0].split(\"types-\")[1]]\n",
"    # Column order: precision all/new/old, then recall all/new/old.\n",
"    scores = tuple(\"{:.2f}\".format(m[metric][subset])\n",
"                   for metric in ('precision', 'recall')\n",
"                   for subset in ('all', 'new', 'old'))\n",
"    data.append((types, days, topk) + scores)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5668ed16",
"metadata": {
"scrolled": false
},
"outputs": [],
"source": [
"# The first tuple of `data` is the header, the remaining tuples are the table rows.\n",
"header, *rows = data\n",
"df = (\n",
"    pd.DataFrame(rows, columns=header)\n",
"    # Top-K was parsed from the file name as text; make it numeric so it sorts as a number.\n",
"    .assign(**{'Top-K': lambda frame: [int(x) for x in frame['Top-K'].values]})\n",
"    .sort_values(by=['Concept Type', 'Time (in days)', 'Top-K'])\n",
")\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6af1d523",
"metadata": {},
"outputs": [],
"source": [
"df.to_csv(\"./summary.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "94065191",
"metadata": {},
"outputs": [],
"source": [
"tokenizer = SimpleMapTokenizer.load(TOKENIZER_PATH)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "37d6d42c",
"metadata": {},
"outputs": [],
"source": [
"# For the standard model get top 20 best performing concepts\n",
"# NOTE: pickle.load can execute arbitrary code; only load metrics files from a trusted source.\n",
"# The context manager makes sure the file handle is closed.\n",
"with open('./start-0_topk-1_time_range-30_types-all_types.pickle', 'rb') as metrics_file:\n",
"    m = pickle.load(metrics_file)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "328ab29b",
"metadata": {},
"outputs": [],
"source": [
"df_new = metrics_data2df(m, tkn2name=tokenizer.tkn2name, temporality='new')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6cda4510",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df_new.head(n=20).to_csv(\"./top_20_cuis_new.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f2fa1496",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df_new.head(n=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "353b719e",
"metadata": {},
"outputs": [],
"source": [
"df_new[df_new.negatives>100].tail(n=20).to_csv('bottom_20_cuis_with_min_100_negatives_new.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "46671f82",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df_new[df_new.negatives>100].tail(n=20)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3e9488a7",
"metadata": {},
"outputs": [],
"source": [
"df_old = metrics_data2df(m, tkn2name=tokenizer.tkn2name, temporality='old')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "602b9d17",
"metadata": {},
"outputs": [],
"source": [
"df_old.head(n=20).to_csv(\"./top_20_cuis_old.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4065a3b7",
"metadata": {},
"outputs": [],
"source": [
"df_old[df_old.negatives>100].tail(n=20).to_csv('bottom_20_cuis_with_min_100_negatives_old.csv')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "33e16c89",
"metadata": {},
"outputs": [],
"source": [
"df_new[df_new.positives > 10]"
]
},
{
"cell_type": "markdown",
"id": "125dfbd6",
"metadata": {},
"source": [
"# Dataset Metrics"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5d6036ff",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"cat = CAT.load_model_pack(CAT_PATH, meta_cat_config_dict={'general': {'device': 'cpu'}})"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a84ddda2",
"metadata": {},
"outputs": [],
"source": [
"dataset = datasets.load_from_disk(JUST_BEFORE_ENCODING_DATASET_SPLIT_PATH)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "262b0aec",
"metadata": {},
"outputs": [],
"source": [
"dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "26b62828",
"metadata": {},
"outputs": [],
"source": [
"# Map every patient_id to the indices of the rows that belong to that patient.\n",
"# Only the patient_id column is read instead of materialising every full row.\n",
"id2inds = {}\n",
"for ind, patient_id in enumerate(dataset[DATASET]['patient_id']):\n",
"    id2inds.setdefault(patient_id, []).append(ind)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7c4509e2",
"metadata": {},
"outputs": [],
"source": [
"# Merge all rows of a patient into that patient's first row. The remaining rows\n",
"# get patient_id == \"SKIP\" so that later cells can ignore them.\n",
"data = dataset[DATASET].to_dict()\n",
"for inds in id2inds.values():\n",
"    first, rest = inds[0], inds[1:]\n",
"    for ind in rest:\n",
"        for column in ('stream', 'token_type', 'time'):\n",
"            data[column][first].extend(data[column][ind])\n",
"        data['patient_id'][ind] = \"SKIP\"\n",
"dataset_combined = datasets.Dataset.from_dict(data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "fb192cce",
"metadata": {},
"outputs": [],
"source": [
"# Age groups: the labels are kept unchanged because they are the keys used in the report;\n",
"# every bound is the inclusive upper age of the group at the same position.\n",
"AGE_GROUP_LABELS = ['0-18', '18-30', '30-41', '41-50', '51-64', '64+']\n",
"AGE_GROUP_UPPER_BOUNDS = [18, 30, 41, 50, 64]\n",
"# Raw values of the 'sex' token that are recognised; anything else counts as 'Unknown'.\n",
"SEX_ALIASES = {'Female': 'Female', 'F': 'Female', 'Male': 'Male', 'M': 'Male'}\n",
"\n",
"\n",
"def age_group_for(age):\n",
"    \"\"\"Return the label of the first age group whose inclusive upper bound is >= ``age``.\n",
"\n",
"    Ages above the last bound fall into the open-ended final group.\n",
"    \"\"\"\n",
"    for upper, label in zip(AGE_GROUP_UPPER_BOUNDS, AGE_GROUP_LABELS):\n",
"        if age <= upper:\n",
"            return label\n",
"    return AGE_GROUP_LABELS[-1]\n",
"\n",
"\n",
"timeline_lens = []\n",
"timeline_len_years = []\n",
"timeline_len_by_sex = {'Female': [], 'Male': [], 'Unknown': []}\n",
"timeline_len_by_sex_y = {'Female': [], 'Male': [], 'Unknown': []}\n",
"timeline_len_by_eth = {}\n",
"timeline_len_by_eth_y = {}\n",
"timeline_len_by_age = {label: [] for label in AGE_GROUP_LABELS}\n",
"timeline_len_by_age_y = {label: [] for label in AGE_GROUP_LABELS}\n",
"len_per_type = {}\n",
"sex = {'Female': 0, 'Male': 0, 'Unknown': 0}\n",
"age_groups = {label: [] for label in AGE_GROUP_LABELS}\n",
"ethnicity = {}\n",
"all_types = set(x for x in tokenizer.token_type2tokens.keys() if x.startswith('T-'))\n",
"# Display title of every concept type, looked up once instead of once per patient.\n",
"type_titles = {t: cat.cdb.addl_info['type_id2name'][t].title() for t in all_types}\n",
"\n",
"for e in dataset_combined:\n",
"    # Rows that were merged into another row of the same patient.\n",
"    if e['patient_id'] == 'SKIP':\n",
"        continue\n",
"\n",
"    token_types = e['token_type']\n",
"    stream = e['stream']\n",
"\n",
"    # Timeline length = number of concept tokens (token types starting with 'T-')\n",
"    t_len = sum(1 for x in token_types if x.startswith(\"T-\"))\n",
"    timeline_lens.append(t_len)\n",
"\n",
"    # Timeline in years, clamped at 0 when the last timestamp precedes the first\n",
"    first_seen = datetime.fromtimestamp(e['time'][0])\n",
"    last_seen = datetime.fromtimestamp(e['time'][-1])\n",
"    l_years = max((last_seen - first_seen).days / 365, 0)\n",
"    timeline_len_years.append(l_years)\n",
"\n",
"    # Age: a group is counted every time the patient enters it (multi counting),\n",
"    # while the timeline length is attributed to the group of the last age token only.\n",
"    age_inds = [i for i, v in enumerate(token_types) if v == 'age']\n",
"    old_age_group = None\n",
"    for ind in age_inds:\n",
"        y = int(stream[ind])\n",
"        group = age_group_for(y)\n",
"        if group != old_age_group:\n",
"            age_groups[group].append(y)\n",
"            old_age_group = group\n",
"        # Use the last ind to determine pts current age\n",
"        if ind == age_inds[-1]:\n",
"            timeline_len_by_age[group].append(t_len)\n",
"            timeline_len_by_age_y[group].append(l_years)\n",
"\n",
"    # Sex (first 'sex' token; missing or unrecognised values count as 'Unknown')\n",
"    if 'sex' in token_types:\n",
"        sex_key = SEX_ALIASES.get(stream[token_types.index('sex')], 'Unknown')\n",
"    else:\n",
"        sex_key = 'Unknown'\n",
"    sex[sex_key] += 1\n",
"    timeline_len_by_sex[sex_key].append(t_len)\n",
"    timeline_len_by_sex_y[sex_key].append(l_years)\n",
"\n",
"    # Ethnicity (first 'ethnicity' token, raw value; 'Unknown' when there is none)\n",
"    if 'ethnicity' in token_types:\n",
"        eth_key = stream[token_types.index('ethnicity')]\n",
"    else:\n",
"        eth_key = 'Unknown'\n",
"    ethnicity[eth_key] = ethnicity.get(eth_key, 0) + 1\n",
"    timeline_len_by_eth.setdefault(eth_key, []).append(t_len)\n",
"    timeline_len_by_eth_y.setdefault(eth_key, []).append(l_years)\n",
"\n",
"    # Number of tokens of every concept type for this patient (0 when the type is absent)\n",
"    for type_id, title in type_titles.items():\n",
"        len_per_type.setdefault(title, []).append(token_types.count(type_id))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "de1cf1aa",
"metadata": {},
"outputs": [],
"source": [
"# Report, for every concept type, the mean of its per-patient token counts\n",
"fprint(\"Mean number of concepts of certain type per pt\")\n",
"for type_title, counts in len_per_type.items():\n",
"    mean_count = np.mean(counts)\n",
"    fprint(\"{:30} : {}\".format(type_title, mean_count))\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "29ae3172",
"metadata": {},
"outputs": [],
"source": [
"# Per age group: mean number of concepts, with the mean timeline length in years in brackets.\n",
"# The group labels are printed first; the value lines follow in the same order.\n",
"fprint(\"Mean timeline length by age group\")\n",
"fprint(timeline_len_by_age.keys(), '')\n",
"for group, lens in timeline_len_by_age.items():\n",
"    mean_concepts = np.mean(lens)\n",
"    mean_years = np.mean(timeline_len_by_age_y[group])\n",
"    fprint(\"{:.0f} ({:.1f})\".format(mean_concepts, mean_years))\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dd67782c",
"metadata": {},
"outputs": [],
"source": [
"ETHNICITY_GROUPS = ['White', 'Black', 'Other', 'Asian', 'Unknown', 'Mixed']\n",
"# (group, keywords) pairs checked in order; the first group with a matching keyword wins.\n",
"ETHNICITY_KEYWORDS = [\n",
"    ('Asian', ('ASIAN',)),\n",
"    ('Black', ('BLACK',)),\n",
"    ('White', ('WHITE',)),\n",
"    ('Unknown', ('UNKNOWN', 'PATIENT DECLINED TO ANSWER', 'UNABLE TO OBTAIN')),\n",
"    ('Mixed', ('MULTI',)),\n",
"]\n",
"\n",
"\n",
"def ethnicity_group(eth):\n",
"    \"\"\"Map a raw ethnicity value onto one of the coarse ETHNICITY_GROUPS.\n",
"\n",
"    Matching is done on the upper-cased value. This matters for the title-case\n",
"    'Unknown' key that is used for patients without an ethnicity token: a\n",
"    case-sensitive check for 'UNKNOWN' would send it to 'Other'.\n",
"    Values that match no keyword are returned as 'Other'.\n",
"    \"\"\"\n",
"    eth_upper = str(eth).upper()\n",
"    for group, keywords in ETHNICITY_KEYWORDS:\n",
"        if any(keyword in eth_upper for keyword in keywords):\n",
"            return group\n",
"    return 'Other'\n",
"\n",
"\n",
"new_timeline_len_by_eth = {group: [] for group in ETHNICITY_GROUPS}\n",
"new_timeline_len_by_eth_y = {group: [] for group in ETHNICITY_GROUPS}\n",
"\n",
"for eth in timeline_len_by_eth:\n",
"    group = ethnicity_group(eth)\n",
"    new_timeline_len_by_eth[group].extend(timeline_len_by_eth[eth])\n",
"    new_timeline_len_by_eth_y[group].extend(timeline_len_by_eth_y[eth])\n",
"\n",
"# Per group: mean number of concepts, with the mean timeline length in years in brackets\n",
"fprint(\"Mean timeline length by ethnicity\")\n",
"for eth in new_timeline_len_by_eth:\n",
"    fprint(\"{:10} : {:.0f} ({:.1f})\".format(eth, np.mean(new_timeline_len_by_eth[eth]),\n",
"                                            np.mean(new_timeline_len_by_eth_y[eth])))\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c6e999d8",
"metadata": {},
"outputs": [],
"source": [
"# Per sex: mean number of concepts, with the mean timeline length in years in brackets.\n",
"# The labels are printed first; the value lines follow in the same order.\n",
"fprint(\"Mean timeline length by sex\")\n",
"fprint(timeline_len_by_sex.keys(), '')\n",
"for label, lens in timeline_len_by_sex.items():\n",
"    mean_concepts = np.mean(lens)\n",
"    mean_years = np.mean(timeline_len_by_sex_y[label])\n",
"    fprint(\"{:.0f} ({:.1f})\".format(mean_concepts, mean_years))\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5a1f4fb3",
"metadata": {},
"outputs": [],
"source": [
"fprint(\"Mean timeline len: \", np.mean(timeline_lens))\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4aa01a65",
"metadata": {},
"outputs": [],
"source": [
"# Number of pts by ethnicity\n",
"#fprint(\"Ethnicity: \", ethnicity)\n",
"new_ethnicity = {'White': 0, 'Black': 0, 'Other': 0,\n",
"                 'Asian': 0, 'Unknown': 0, 'Mixed': 0}\n",
"# (group, keywords) pairs checked in order; the first match wins, no match means 'Other'.\n",
"eth_count_rules = [\n",
"    ('Asian', ('ASIAN',)),\n",
"    ('Black', ('BLACK',)),\n",
"    ('White', ('WHITE',)),\n",
"    ('Unknown', ('UNKNOWN', 'PATIENT DECLINED TO ANSWER', 'UNABLE TO OBTAIN')),\n",
"    ('Mixed', ('MULTI',)),\n",
"]\n",
"\n",
"for eth, n_pts in ethnicity.items():\n",
"    # Match on the upper-cased value: patients without an ethnicity token are stored\n",
"    # under the title-case key 'Unknown', which a case-sensitive 'UNKNOWN' check\n",
"    # would count as 'Other'.\n",
"    eth_upper = str(eth).upper()\n",
"    group = next((name for name, keywords in eth_count_rules\n",
"                  if any(keyword in eth_upper for keyword in keywords)), 'Other')\n",
"    new_ethnicity[group] += n_pts\n",
"fprint(new_ethnicity)\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c16fe4f2",
"metadata": {},
"outputs": [],
"source": [
"# Number of pts by sex\n",
"fprint(sex)\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f0aae15a",
"metadata": {},
"outputs": [],
"source": [
"fprint(\"Total pts for sex: \", sum(sex.values()))\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d4401a1c",
"metadata": {},
"outputs": [],
"source": [
"dataset"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3b19d621",
"metadata": {},
"outputs": [],
"source": [
"# Number of pts by age (note that we are multi counting, if for one pt we have age 27, 28 and 35 that will be three counts)\n",
"fprint(\"Age group, mean age for group, number of patients in this group (with multi counting)\")\n",
"for g, ages in age_groups.items():\n",
"    fprint('{} - {:.3f} - {}'.format(g, np.mean(ages), len(ages)))\n",
"# Total number of (multi-counted) entries over all groups; kept for interactive inspection.\n",
"t_cnt = sum(len(ages) for ages in age_groups.values())\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f14d0451",
"metadata": {},
"outputs": [],
"source": [
"# Overall timeline mean length in years \n",
"fprint('Timeline len in years: ', np.mean(timeline_len_years))\n",
"fprint('\\n')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "35f6bf1c",
"metadata": {},
"outputs": [],
"source": [
"ds_info.close()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}