In [None]:
! pip install icd10-cm

Collecting icd10-cm
  Downloading icd10_cm-0.0.4-py2.py3-none-any.whl (675 kB)
[?25l[K     |▌                               | 10 kB 26.9 MB/s eta 0:00:01[K     |█                               | 20 kB 12.0 MB/s eta 0:00:01[K     |█▌                              | 30 kB 9.7 MB/s eta 0:00:01[K     |██                              | 40 kB 8.5 MB/s eta 0:00:01[K     |██▍                             | 51 kB 4.4 MB/s eta 0:00:01[K     |███                             | 61 kB 5.2 MB/s eta 0:00:01[K     |███▍                            | 71 kB 5.6 MB/s eta 0:00:01[K     |███▉                            | 81 kB 5.7 MB/s eta 0:00:01[K     |████▍                           | 92 kB 6.4 MB/s eta 0:00:01[K     |████▉                           | 102 kB 5.1 MB/s eta 0:00:01[K     |█████▍                          | 112 kB 5.1 MB/s eta 0:00:01[K     |█████▉                          | 122 kB 5.1 MB/s eta 0:00:01[K     |██████▎                         | 133 kB 5.1 MB/s eta 0:00:0

In [None]:
import pandas as pd
import os
import numpy as np
import icd10
import pickle
import matplotlib.pyplot as plt

In [None]:
# Mount Google Drive at /content/gdrive so the project folder can be read and written.
# Only works inside Google Colab (google.colab is not available elsewhere).
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Demo of icd10: look up category R11 and print its metadata.
# The trailing comments show what this cell printed when it was last run.
code = icd10.find("R11")
print(code.description)         # Nausea and vomiting
if code.billable:
    print(code, "is billable")  # nothing was printed here for R11

print(code.chapter)             # XVIII
print(code.block)               # R00-R99
print(code.block_description)   # Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified

Nausea and vomiting
XVIII
R00-R99
Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified


In [None]:
# Root of the project folder on the mounted Drive; the data-loading cell builds its input paths from it.
deep_learning_dir = '/content/gdrive/My Drive/BMI 707 Project' 

In [None]:
# Load the three pre-split datasets from the project folder.
# NOTE: unpickling can execute arbitrary code; only load pickles produced by this project.
df_training = pd.read_pickle(deep_learning_dir + '/data_formatting/training_data.pickle')
df_val = pd.read_pickle(deep_learning_dir + '/data_formatting/validation_data.pickle')
df_testing = pd.read_pickle(deep_learning_dir + '/data_formatting/testing_data.pickle')

# Stack the splits (training, then validation, then testing) into one frame.
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result
# as the chained append, including keeping each split's own index labels.
df_total = pd.concat([df_training, df_val, df_testing])

# Sanity check: the row count of df_total should be the sum of the three splits.
print(df_training.shape, df_val.shape, df_testing.shape, df_total.shape)

(3094, 7) (344, 7) (1146, 7) (4584, 7)


# Impute participant data

In [None]:
# One row per (trial, ICD code). Assumes 'icdcodes' holds a list of code strings per trial.
df = df_total.explode('icdcodes')
# Truncate every code to the part before the first '.'
df['icdcodes'] = df['icdcodes'].apply(lambda x: x.split(".")[0])
# Keep only the first listed ICD code of each trial. explode() repeats index labels
# (and the stacked splits may share labels), so renumber the rows to give later
# label-aligned assignments a unique index to work with.
df = df.drop_duplicates('nctid', keep='first').reset_index(drop=True)

In [None]:
def get_chapter(x):
  """Return the ICD-10 chapter reported by the icd10 package for code `x`.

  The code is looked up with `icd10.find` and the `chapter` attribute of the
  result is returned as-is (it can be None). If reading that attribute raises
  any exception, for example because the lookup did not return a code object,
  the fallback label 'Other' is returned instead.
  """
  code = icd10.find(x)
  try:
    return code.chapter
  except Exception:
    # Deliberately broad: any failure while reading the chapter maps to 'Other'.
    return 'Other'

In [None]:
# Derive the ICD-10 chapter of every trial from its (truncated) ICD code.
df['chapter'] = df['icdcodes'].map(get_chapter)

In [None]:
# Distinct chapter values produced by get_chapter, to spot codes that need manual mapping.
df['chapter'].unique()

array(['XVIII', 'V', 'VI', 'IV', 'XI', None, 'III', 'VII', 'XIII', 'II',
       'X', 'I', 'IX', 'XV', 'XXI', 'XII', 'XIV', 'XX', 'XIX', 'XVI',
       'XVII', 'Other', 'VIII'], dtype=object)

In [None]:
# ICD codes for which get_chapter fell back to 'Other', with their frequencies
df.loc[df['chapter'] == 'Other', 'icdcodes'].value_counts()

B00    44
O9A     6
J00     2
C7A     1
O00     1
Name: icdcodes, dtype: int64

In [None]:
# ICD codes whose chapter is missing (None/NaN), with their frequencies.
# These are the neoplasm (C*) codes plus K94, which the hand-curation cell
# assigns to a different chapter than the C* codes.
df.loc[df['chapter'].isna(), 'icdcodes'].value_counts()

C79    158
C78    102
C61     54
C90     40
C95     31
C91     20
C76     16
C96     16
C92     15
C57     11
K94     10
C49     10
C71      6
C73      5
C83      4
C67      3
C88      2
C84      1
C81      1
Name: icdcodes, dtype: int64

In [None]:
# Hand curation of the codes the icd10 package could not place in a chapter.
# Step 1: every trial with a missing chapter defaults to chapter II.
df.loc[df['chapter'].isna(), 'chapter'] = 'II'
# Step 2: explicit per-code overrides. This must run after step 1, because K94
# has a missing chapter and would otherwise stay in chapter II.
manual_chapter_by_code = {
    'B00': 'I',
    'C7A': 'II',
    'J00': 'X',
    'K94': 'XI',
    'O00': 'XV',
    'O9A': 'XV',
}
for icd_code, chapter_numeral in manual_chapter_by_code.items():
    df.loc[df['icdcodes'] == icd_code, 'chapter'] = chapter_numeral

In [None]:
# Fraction (not count) of trials whose n_participants value is missing
df['n_participants'].isna().mean()

0.3706369982547993

In [None]:
#unique_chapter = df.chapter.unique()
#
#for x in unique_chapter: 
#  df[df.chapter == x]['n_participants'].hist(bins=30)
#  plt.title('Chapter %s' %x)
#  plt.show()

In [None]:
# Fill missing participant counts with the median of the trial's ICD chapter.
# The median is used rather than the mean because the counts are skewed.
median_by_chapter = df.groupby('chapter')['n_participants'].transform('median')
df['n_participants'] = df['n_participants'].fillna(median_by_chapter)

In [None]:
# Preview the first rows after chapter assignment and imputation.
df.head()

Unnamed: 0,nctid,n_participants,drugs,diseases,icdcodes,criteria,label,chapter
0,NCT00475085,944.0,"[aprepitant, dexamethasone, granisetron hydroc...",[nausea],R11,\n Inclusion criteria:\n\n - ...,1,XVIII
1,NCT01626859,152.0,"[mp-214 low dose, mp-214 middle dose, mp-214 h...",[schizophrenia],F20,\n Inclusion Criteria:\n\n - ...,1,V
2,NCT00203957,2605.0,"[istradefylline, istradefylline]",[parkinsons disease],G20,\n Inclusion Criteria:\n\n - ...,1,VI
3,NCT00169832,3204.0,[rosiglitazone or placebo],"[diabetes, coronary artery bypass grafting]",E23,\n Inclusion Criteria:\n\n AT SC...,0,IV
4,NCT01249352,1958.0,"[nimotuzumab, cisplatin, fluorouracil]","[esophageal cancer, adenocarcinoma]",K22,\n Inclusion Criteria:\n\n 1. ...,1,XI


In [None]:
# Express each trial's size relative to its ICD chapter:
#   norm = (n_participants - chapter median) / chapter median
# A single grouped transform computes the median once per chapter, already
# aligned to df's rows, instead of re-filtering the frame for every chapter.
# NOTE(review): a chapter whose median is 0 yields inf/NaN here, as it did before.
chapter_median = df.groupby("chapter")["n_participants"].transform("median")
df["norm_n_participants"] = (df["n_participants"] - chapter_median) / chapter_median

In [None]:
# Preview the first rows with the new norm_n_participants column.
df.head()

Unnamed: 0,nctid,n_participants,drugs,diseases,icdcodes,criteria,label,chapter,norm_n_participants
0,NCT00475085,944.0,"[aprepitant, dexamethasone, granisetron hydroc...",[nausea],R11,\n Inclusion criteria:\n\n - ...,1,XVIII,-0.477298
1,NCT01626859,152.0,"[mp-214 low dose, mp-214 middle dose, mp-214 h...",[schizophrenia],F20,\n Inclusion Criteria:\n\n - ...,1,V,-0.930211
2,NCT00203957,2605.0,"[istradefylline, istradefylline]",[parkinsons disease],G20,\n Inclusion Criteria:\n\n - ...,1,VI,0.0
3,NCT00169832,3204.0,[rosiglitazone or placebo],"[diabetes, coronary artery bypass grafting]",E23,\n Inclusion Criteria:\n\n AT SC...,0,IV,0.0
4,NCT01249352,1958.0,"[nimotuzumab, cisplatin, fluorouracil]","[esophageal cancer, adenocarcinoma]",K22,\n Inclusion Criteria:\n\n 1. ...,1,XI,0.0


# Trial success data

In [None]:
# Success probability per ICD chapter = sum of 'label' / number of trials in the chapter.
# Assumes 'label' is 1 for a successful trial and 0 otherwise.
# String aliases are used instead of NumPy callables, whose dispatch inside
# agg() is deprecated in recent pandas; the computed values are unchanged.
# NOTE(review): df is built from training + validation + testing rows, so these
# rates include held-out labels. Confirm this is intended before using the
# feature to evaluate a model.
trial_success = df.groupby(['chapter']).agg(total_trials=('nctid', 'size'),
                                            successful_trial=('label', 'sum'))
trial_success['probability_success'] = trial_success['successful_trial'] / trial_success['total_trials']
# Keep only the rate, as a Series indexed by chapter.
trial_success = trial_success['probability_success']

In [None]:
# Attach each chapter's success probability to its trials. The left join keeps
# every trial, and a column-on-column merge returns rows numbered 0..n-1.
df = pd.merge(df, trial_success, how='left', on='chapter')
df.head()

Unnamed: 0,nctid,n_participants,drugs,diseases,icdcodes,criteria,label,chapter,norm_n_participants,probability_success
0,NCT00475085,944.0,"[aprepitant, dexamethasone, granisetron hydroc...",[nausea],R11,\n Inclusion criteria:\n\n - ...,1,XVIII,-0.477298,0.538462
1,NCT01626859,152.0,"[mp-214 low dose, mp-214 middle dose, mp-214 h...",[schizophrenia],F20,\n Inclusion Criteria:\n\n - ...,1,V,-0.930211,0.716814
2,NCT00203957,2605.0,"[istradefylline, istradefylline]",[parkinsons disease],G20,\n Inclusion Criteria:\n\n - ...,1,VI,0.0,0.660465
3,NCT00169832,3204.0,[rosiglitazone or placebo],"[diabetes, coronary artery bypass grafting]",E23,\n Inclusion Criteria:\n\n AT SC...,0,IV,0.0,0.854633
4,NCT01249352,1958.0,"[nimotuzumab, cisplatin, fluorouracil]","[esophageal cancer, adenocarcinoma]",K22,\n Inclusion Criteria:\n\n 1. ...,1,XI,0.0,0.611765


In [None]:
# Map each trial id to its feature vector [norm_n_participants, probability_success].
# The columns are walked in lockstep, so the result does not depend on df's
# index labels matching row positions (the previous df[col][i] lookup did).
final_data = {
    nctid: np.array([norm_size, success_rate])
    for nctid, norm_size, success_rate in zip(
        df["nctid"], df["norm_n_participants"], df["probability_success"]
    )
}

In [None]:
# Serialise the nctid -> feature-vector mapping into the current working directory.
output_name = "nctid2npart_success.pkl"
with open(output_name, 'wb') as out_file:
    pickle.dump(final_data, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Move the pickle from the working directory into the project's embeddings folder on Drive.
!mv nctid2npart_success.pkl "/content/gdrive/My Drive/BMI 707 Project/embeddings/"