In [None]:
# Used for development
#import sys
#sys.path.insert(0, "../foresight/")
#import sys
#sys.path.insert(0, "../MedCAT/")
#%load_ext autoreload
#%autoreload 2

In [None]:
import pandas as pd
import sys
import os
import pickle
import datasets
import numpy as np
from medcat.cat import CAT
from datetime import datetime

In [None]:
# --- Run selection ----------------------------------------------------------
DATASET = 'test'   # name of the dataset split analysed in the "Dataset Metrics" section
DAYS = 1 # Do: 1, 14, 30
MAX_SEQ_LEN = 256
TYPES = ['ALL_TYPES']

BASE_NAME = 'annotated_february_2022'
DATASET_NAME = 'annotations_stream_phase2_v1'
# RUN_NAME identifies one (dataset, bucket size, sequence length, types) combination
RUN_NAME = f'{DATASET_NAME}_{DAYS}d_{MAX_SEQ_LEN}_' + "_".join(TYPES)

# --- Directory prefixes shared by the paths below ---------------------------
_MIMIC_RAW_DIR = "./data/mimic"
_TIMECAT_DIR = "./data/timecat"
_TIMECAT_MIMIC_DIR = f"{_TIMECAT_DIR}/mimic/{BASE_NAME}"

# --- Dataset artefacts ------------------------------------------------------
DATA_PATH = f"{_TIMECAT_MIMIC_DIR}/{DATASET_NAME}.pickle"
DATA_PATH_SPLITS = f"{_TIMECAT_MIMIC_DIR}/{DATASET_NAME}_split/"
TOKENIZER_PATH = f"{_TIMECAT_DIR}/models/gpt/tokenizer_{RUN_NAME}.pickle"
ALMOST_PREPARED_DATASET_SPLIT_PATH = f"{_TIMECAT_MIMIC_DIR}/{RUN_NAME}_almost_prepared_split/"
PREPARED_DATASET_SPLIT_PATH = f"{_TIMECAT_MIMIC_DIR}/{RUN_NAME}_prepared_split/"
JUST_BEFORE_ENCODING_DATASET_SPLIT_PATH = f"{_TIMECAT_MIMIC_DIR}/{RUN_NAME}_just_before_encoding/"

# --- Model pack -------------------------------------------------------------
CAT_PATH = "./data/models/modelpacks/mc_modelpack_phase2_snomed_190k_february_2022.zip"

# --- Per-patient lookup tables ----------------------------------------------
PT_DOB_PATH = f"{_MIMIC_RAW_DIR}/pt2dob_datetime.pickle"
PT_DOD_PATH = f"{_MIMIC_RAW_DIR}/pt2dod_timestamp.pickle"
PT_SEX_PATH = f"{_MIMIC_RAW_DIR}/pt2sex.pickle"
PT_LNS_PATH = f"{_TIMECAT_MIMIC_DIR}/lns_{DATASET_NAME}.pickle"
PT_CNTS_PATH = f"{_TIMECAT_MIMIC_DIR}/cnts_{DATASET_NAME}.pickle"
PT_ETHNICITY_PATH = f"{_MIMIC_RAW_DIR}/pt2ethnicity.pickle"
TOKEN_TYPES_PATH = f"{_TIMECAT_MIMIC_DIR}/types_{DATASET_NAME}.pickle"

In [None]:
# Plain-text report that mirrors everything printed through `fprint`.
# The handle is intentionally left open across cells and is closed by the
# final `ds_info.close()` cell of the notebook.
ds_info = open("dataset-metrics-" + DATASET + '-' + RUN_NAME + '.txt', 'w', encoding='utf-8')


def fprint(*texts):
    """Print every argument and append it to the ``ds_info`` report file.

    Each argument is emitted on its own line, both on stdout and in the file
    (so ``fprint("label", value)`` produces two lines).

    Args:
        *texts: Objects to report; each is converted with ``str`` for the file.
    """
    for text in texts:
        print(text)
        ds_info.write(str(text) + "\n")
    # Flush after every call: in a notebook the closing cell is often never
    # reached (interrupted or partial runs), which would otherwise leave the
    # report empty or truncated on disk.
    ds_info.flush()

In [None]:
from foresight.metrics.next_concept_prediction import precision, metrics_data2df, ComputePrecisionHF
from foresight.tokenizers.simple_map_tokenizer import SimpleMapTokenizer
# NOTE(review): this rebinds TOKENIZER_PATH, replacing the relative
# "./data/timecat/..." value from the configuration cell with an absolute path
# under one user's home directory. The tokenizer is later loaded from this
# value, so the notebook only runs where that path exists — confirm which
# location is intended and keep a single definition.
TOKENIZER_PATH = f"/home/wish/data/timecat/models/gpt/tokenizer_{RUN_NAME}.pickle"

In [None]:
# Collect one summary row per metrics pickle found in ./metrics/.
# The first tuple is the header; the next cell turns `data` into a DataFrame.
data = [('Concept Type', 'Time (in days)', 'Top-K', 'Overall (MIMIC)', 'New (MIMIC)', 'Old (MIMIC)',
         'Recall All', 'Recall New', 'Recall Old')]
# Type identifier used in the file name -> human readable concept type
tmap = {'T-11': 'Disorders', 'T-55': 'Substances', 'T-18': 'Findings', 'all': "All Concepts", 'T-39': 'Procedures'}
for name in os.listdir("./metrics/"):
    # Only the runs that start at position 0 are summarised
    if not name.startswith("start-0"):
        continue

    # NOTE: pickle.load can execute arbitrary code - only load metrics files
    # produced by a trusted run. The context manager closes the handle, which
    # the previous `pickle.load(open(...))` form never did.
    with open("./metrics/" + name, 'rb') as metrics_file:
        m = pickle.load(metrics_file)

    # File names look like
    #   start-0_topk-<K>_time_range-<DAYS>_types-<TYPE>[_types].pickle
    # so after splitting on "_": [1] = topk-<K>, [3] = range-<DAYS>, [4] = types-<TYPE>
    parts = name.split("_")
    topk = parts[1].split("-")[1]
    days = int(parts[3].split("-")[1])
    concept_type = tmap[parts[4].split(".")[0].split("types-")[1]]

    # Column order matches the header: precision all/new/old, then recall all/new/old
    scores = ["{:.2f}".format(m[metric][subset])
              for metric in ('precision', 'recall')
              for subset in ('all', 'new', 'old')]
    data.append((concept_type, days, topk, *scores))

In [None]:
# First tuple of `data` is the header, every following tuple is one row
header, *rows = data
df = pd.DataFrame(rows, columns=header)
# Top-K was parsed from the file name as text; make it numeric so it sorts numerically
df['Top-K'] = [int(k) for k in df['Top-K'].values]
sort_columns = ['Concept Type', 'Time (in days)', 'Top-K']
df = df.sort_values(by=sort_columns)
df

In [None]:
# Write the sorted precision/recall summary table to ./summary.csv
df.to_csv("./summary.csv")

In [None]:
# Load the tokenizer stored at TOKENIZER_PATH; its `tkn2name` and
# `token_type2tokens` attributes are used in the cells below.
tokenizer = SimpleMapTokenizer.load(TOKENIZER_PATH)

In [None]:
# For the standard model get top 20 best performing concepts
# NOTE(review): this file is read from the working directory, while the summary
# loop above reads its pickles from ./metrics/ - confirm the intended location.
# NOTE: pickle.load can execute arbitrary code - only load trusted files.
with open('./start-0_topk-1_time_range-30_types-all_types.pickle', 'rb') as metrics_file:
    m = pickle.load(metrics_file)

In [None]:
# Per-concept metrics table for the 'new' temporality; token ids are mapped to
# names through the tokenizer. Presumably sorted best-first (the cells below
# treat head() as "top" and tail() as "bottom") - verify in metrics_data2df.
df_new = metrics_data2df(m, tkn2name=tokenizer.tkn2name, temporality='new')

In [None]:
# Save the first 20 rows of the 'new' table
df_new.head(n=20).to_csv("./top_20_cuis_new.csv")

In [None]:
# Display the same 20 rows that were written to top_20_cuis_new.csv
df_new.head(n=20)

In [None]:
# Save the last 20 rows among 'new' concepts that have more than 100 negatives
df_new[df_new.negatives>100].tail(n=20).to_csv('bottom_20_cuis_with_min_100_negatives_new.csv')

In [None]:
# Display the same 20 rows that were written to the bottom-20 'new' CSV
df_new[df_new.negatives>100].tail(n=20)

In [None]:
# Per-concept metrics table for the 'old' temporality, built from the same
# metrics object and tokenizer name mapping as the 'new' table.
df_old = metrics_data2df(m, tkn2name=tokenizer.tkn2name, temporality='old')

In [None]:
# Save the first 20 rows of the 'old' table
df_old.head(n=20).to_csv("./top_20_cuis_old.csv")

In [None]:
# Save the last 20 rows among 'old' concepts that have more than 100 negatives
df_old[df_old.negatives>100].tail(n=20).to_csv('bottom_20_cuis_with_min_100_negatives_old.csv')

In [None]:
# Display the 'new' concepts that have more than 10 positives
df_new[df_new.positives > 10]

# Dataset Metrics

In [None]:
# Load the MedCAT model pack with the MetaCAT device set to CPU; `cat.cdb` is
# used further down to translate type ids into readable names.
cat = CAT.load_model_pack(CAT_PATH, meta_cat_config_dict={'general': {'device': 'cpu'}})

In [None]:
# Load the dataset saved at JUST_BEFORE_ENCODING_DATASET_SPLIT_PATH; the cells
# below index it by split name (`dataset[DATASET]`).
dataset = datasets.load_from_disk(JUST_BEFORE_ENCODING_DATASET_SPLIT_PATH)

In [None]:
# Display the loaded dataset object
dataset

In [None]:
# patient_id -> positions of all rows of the selected split that belong to
# that patient (a patient may be spread over several rows)
id2inds = {}
for ind, row in enumerate(dataset[DATASET]):
    id2inds.setdefault(row['patient_id'], []).append(ind)

In [None]:
# Merge all rows of a patient into that patient's first row.
# The later rows stay in the table but are tagged with patient_id "SKIP" so
# the statistics loop can ignore them. The loop no longer binds a variable
# called `id`, which used to shadow the builtin for the rest of the session.
data = dataset[DATASET].to_dict()
for inds in id2inds.values():
    first, *rest = inds
    for ind in rest:
        # Append the per-token columns in row order
        for column in ('stream', 'token_type', 'time'):
            data[column][first].extend(data[column][ind])
        data['patient_id'][ind] = "SKIP"
dataset_combined = datasets.Dataset.from_dict(data)

In [None]:
# Per-patient statistics over `dataset_combined`.
#
# Accumulators filled by the loop below:
#   timeline_lens / timeline_len_years   - one entry per patient
#   timeline_len_by_{sex,eth,age}[_y]    - the same values split by group
#                                          (`_y` variants hold the length in years)
#   sex / ethnicity                      - number of patients per group
#   age_groups                           - ages recorded per age group
#   len_per_type                         - per-patient concept counts per type name

# (inclusive upper bound, label) pairs checked in order; anything above the
# last bound falls into '64+'. Labels are kept exactly as used in the report.
_AGE_GROUP_BOUNDS = ((18, '0-18'), (30, '18-30'), (41, '30-41'), (50, '41-50'), (64, '51-64'))
_AGE_GROUP_LABELS = [label for _, label in _AGE_GROUP_BOUNDS] + ['64+']
_SEX_LABELS = ['Female', 'Male', 'Unknown']


def _age_group(age):
    """Return the age-group label for an age given in years."""
    for upper_bound, label in _AGE_GROUP_BOUNDS:
        if age <= upper_bound:
            return label
    return '64+'


def _sex_group(value):
    """Normalise a raw sex token to 'Female', 'Male' or 'Unknown'."""
    if value in ('Female', 'F'):
        return 'Female'
    if value in ('Male', 'M'):
        return 'Male'
    return 'Unknown'


timeline_lens = []
timeline_len_years = []
timeline_len_by_sex = {label: [] for label in _SEX_LABELS}
timeline_len_by_sex_y = {label: [] for label in _SEX_LABELS}
timeline_len_by_eth = {}
timeline_len_by_eth_y = {}
timeline_len_by_age = {label: [] for label in _AGE_GROUP_LABELS}
timeline_len_by_age_y = {label: [] for label in _AGE_GROUP_LABELS}
len_per_type = {}
sex = {label: 0 for label in _SEX_LABELS}
age_groups = {label: [] for label in _AGE_GROUP_LABELS}
ethnicity = {}
all_types = set([x for x in tokenizer.token_type2tokens.keys() if x.startswith('T-')])
# The type id -> display title lookup does not depend on the patient, so do it once
type2title = {type_id: cat.cdb.addl_info['type_id2name'][type_id].title() for type_id in all_types}

for e in dataset_combined:
    # Rows merged into another row of the same patient
    if e['patient_id'] == 'SKIP':
        continue

    token_types = e['token_type']
    stream = e['stream']

    # Timeline length = number of concept tokens (token types starting with "T-")
    t_len = sum(1 for x in token_types if x.startswith("T-"))
    timeline_lens.append(t_len)

    # Timeline in years: span between the first and the last timestamp
    l_years = (datetime.fromtimestamp(e['time'][-1]) - datetime.fromtimestamp(e['time'][0])).days / 365
    if l_years < 0:
        l_years = 0
    timeline_len_years.append(l_years)

    # Age: an age is recorded whenever the group differs from the group of the
    # previous age token; the last age token decides which group receives the
    # patient's timeline length.
    age_inds = [i for i, v in enumerate(token_types) if v == 'age']
    previous_group = None
    for ind in age_inds:
        y = int(stream[ind])
        group = _age_group(y)
        if group != previous_group:
            age_groups[group].append(y)
            previous_group = group
        if ind == age_inds[-1]:
            timeline_len_by_age[group].append(t_len)
            timeline_len_by_age_y[group].append(l_years)

    # Sex: first 'sex' token if there is one, otherwise Unknown
    if 'sex' in token_types:
        sex_group = _sex_group(stream[token_types.index('sex')])
    else:
        sex_group = 'Unknown'
    sex[sex_group] += 1
    timeline_len_by_sex[sex_group].append(t_len)
    timeline_len_by_sex_y[sex_group].append(l_years)

    # Ethnicity: raw value of the first 'ethnicity' token, otherwise 'Unknown'
    if 'ethnicity' in token_types:
        eth = stream[token_types.index('ethnicity')]
    else:
        eth = 'Unknown'
    ethnicity[eth] = ethnicity.get(eth, 0) + 1
    timeline_len_by_eth.setdefault(eth, []).append(t_len)
    timeline_len_by_eth_y.setdefault(eth, []).append(l_years)

    # Concepts per type: count every token type once, then record the count
    # (0 when absent) for each known concept type under its display title
    type_counts = {}
    for token_type in token_types:
        type_counts[token_type] = type_counts.get(token_type, 0) + 1
    for type_id, title in type2title.items():
        len_per_type.setdefault(title, []).append(type_counts.get(type_id, 0))

In [None]:
# Report, for every concept type, the average number of concepts per patient
fprint("Mean number of concepts of certain type per pt")
for type_name, counts in len_per_type.items():
    line = "{:30} : {}".format(type_name, np.mean(counts))
    fprint(line)
fprint('\n')

In [None]:
# Report the average timeline length per age group: concepts, then years in brackets.
# The group labels are printed once up front; the values follow in the same order.
fprint("Mean timeline length by age group")
fprint(timeline_len_by_age.keys(), '')
for age, lens in timeline_len_by_age.items():
    mean_concepts = np.mean(lens)
    mean_years = np.mean(timeline_len_by_age_y[age])
    fprint("{:.0f} ({:.1f})".format(mean_concepts, mean_years))
fprint('\n')

In [None]:
def _ethnicity_group(eth):
    """Map a raw ethnicity value to one of the six reporting groups.

    Matching is case-insensitive: patients without an ethnicity token are
    stored under the key 'Unknown', which the previous upper-case-only test
    ('UNKNOWN' in eth) missed and therefore reported as 'Other'.
    """
    label = str(eth).upper()
    if 'ASIAN' in label:
        return 'Asian'
    if 'BLACK' in label:
        return 'Black'
    if 'WHITE' in label:
        return 'White'
    if 'UNKNOWN' in label or 'PATIENT DECLINED TO ANSWER' in label or 'UNABLE TO OBTAIN' in label:
        return 'Unknown'
    if 'MULTI' in label:
        return 'Mixed'
    return 'Other'


# Timeline lengths (concept counts and years) regrouped from raw ethnicity
# values into the reporting groups
new_timeline_len_by_eth = {'White': [], 'Black': [], 'Other': [], 
                 'Asian': [], 'Unknown': [], 'Mixed': []}
new_timeline_len_by_eth_y = {'White': [], 'Black': [], 'Other': [], 
                 'Asian': [], 'Unknown': [], 'Mixed': []}

for eth in timeline_len_by_eth:
    group = _ethnicity_group(eth)
    new_timeline_len_by_eth[group].extend(timeline_len_by_eth[eth])
    new_timeline_len_by_eth_y[group].extend(timeline_len_by_eth_y[eth])

fprint("Mean timeline length by ethnicity")
for eth in new_timeline_len_by_eth:
    fprint("{:10} : {:.0f} ({:.1f})".format(eth, np.mean(new_timeline_len_by_eth[eth]), 
                                            np.mean(new_timeline_len_by_eth_y[eth])))
fprint('\n')

In [None]:
# Report the average timeline length per sex: concepts, then years in brackets.
# The labels are printed once up front; the values follow in the same order.
fprint("Mean timeline length by sex")
fprint(timeline_len_by_sex.keys(), '')
for s, lens in timeline_len_by_sex.items():
    mean_concepts = np.mean(lens)
    mean_years = np.mean(timeline_len_by_sex_y[s])
    fprint("{:.0f} ({:.1f})".format(mean_concepts, mean_years))
fprint('\n')

In [None]:
# Overall mean number of concepts per patient timeline.
# fprint emits each argument on its own line, so the label and the value
# end up on separate lines.
fprint("Mean timeline len: ", np.mean(timeline_lens))
fprint('\n')

In [None]:
# Number of pts by ethnicity
#fprint("Ethnicity: ", ethnicity)
new_ethnicity = {'White': 0, 'Black': 0, 'Other': 0, 
                 'Asian': 0, 'Unknown': 0, 'Mixed': 0}

for eth in ethnicity:
    # Compare case-insensitively: patients without an ethnicity token are
    # stored under the key 'Unknown', which the previous upper-case-only test
    # ('UNKNOWN' in eth) missed and therefore counted as 'Other'.
    eth_upper = str(eth).upper()
    if 'ASIAN' in eth_upper:
        group = 'Asian'
    elif 'BLACK' in eth_upper:
        group = 'Black'
    elif 'WHITE' in eth_upper:
        group = 'White'
    elif 'UNKNOWN' in eth_upper or 'PATIENT DECLINED TO ANSWER' in eth_upper or 'UNABLE TO OBTAIN' in eth_upper:
        group = 'Unknown'
    elif 'MULTI' in eth_upper:
        group = 'Mixed'
    else:
        group = 'Other'
    new_ethnicity[group] += ethnicity[eth]
fprint(new_ethnicity)
fprint('\n')

In [None]:
# Number of pts by sex (Female / Male / Unknown counters filled by the statistics loop)
fprint(sex)
fprint('\n')

In [None]:
# Total number of patients, obtained by summing the per-sex counters
fprint("Total pts for sex: ", sum(sex.values()))
fprint('\n')

In [None]:
# Display the loaded dataset object again for reference
dataset

In [None]:
# Number of pts by age group. A patient can be counted more than once: one age
# is recorded each time the age group differs from the group of the patient's
# previous age token (so ages 27, 28 and 35 give two counts: 18-30 and 30-41).
t_cnt = 0
fprint("Age group, mean age for group, number of patients in this group (with multi counting)")
for g, ages in age_groups.items():
    n_ages = len(ages)
    fprint('{} - {:.3f} - {}'.format(g, np.mean(ages), n_ages))
    t_cnt += n_ages
fprint('\n')

In [None]:
# Overall timeline mean length in years 
# (label and value are written on separate lines, see fprint)
fprint('Timeline len in years: ', np.mean(timeline_len_years))
fprint('\n')

In [None]:
# Close the report file opened at the top of the notebook; fprint must not be
# called after this cell has run.
ds_info.close()