Clinical_Trial_Outcome_Pr / Git / [c87959] /benchmark/utils.py

Models:
joseph-gordon/
Clinical_Trial_Outcome_Pr
Downloads: 1
[c87959]: / benchmark / utils.py
History
Download this file
137 lines (109 with data), 3.7 kB

###### import ######
import numpy as np
try:
	from rdkit import Chem 
	from rdkit.Chem import AllChem
except:
	pass 
###### import ######


### tricky
def normalize_disease(name):
    """Expand a disease name into a list of progressively looser search terms.

    The name is lower-cased, commas and parentheses are removed, a handful of
    plurals are singularised and 'cancer' is rewritten to 'neoplasm'.  If the
    first word is 'stage', that word and the one after it are dropped.  The
    result starts with the normalised name, followed by fallback variants
    (neoplasm/tumor swaps, generic suffix words removed, coarse carcinoma
    labels, organ keywords, and finally every word of 8+ characters).

    Args:
        name: Free-text disease name.

    Returns:
        A list of candidate strings, most specific first.  The list may
        contain duplicates and empty strings.  Any name that mentions
        'lymphoma' collapses to ``['lymphoma']``.
    """
    name = name.lower()
    if 'lymphoma' in name:
        # Always return a list: callers iterate over the result, and a bare
        # string would be walked character by character.
        return ['lymphoma']

    # Applied in order: 'cancer' -> 'neoplasm' runs before the plural fix so
    # that 'cancers' also ends up as 'neoplasm'.
    substitutions = (
        (',', ''),
        ('(', ' '),
        (')', ' '),
        ('cancer', 'neoplasm'),
        ('neoplasms', 'neoplasm'),
        ('tumors', 'tumor'),
        ('infections', 'infection'),
        ('diseases', 'disease'),
        ('disorders', 'disorder'),
        ('syndromes', 'syndrome'),
    )
    for old, new in substitutions:
        name = name.replace(old, new)

    tokens = name.split()
    # Guard on `tokens` so empty / punctuation-only input does not raise.
    if tokens and tokens[0] == 'stage':
        tokens = tokens[2:]  # drop 'stage' and the word that follows it
    name = ' '.join(tokens)

    name_lst = [name]

    # neoplasm <-> tumor synonyms, plus the "<preceding word> <term>" pair.
    for term, synonym in (('neoplasm', 'tumor'), ('tumor', 'neoplasm')):
        if ' ' + term not in name:
            continue
        name_lst.append(name.replace(term, synonym))
        words = name.split()
        # The term may only occur inside a larger token (e.g.
        # 'neoplasm-related'); then there is no standalone word to pair up.
        if term in words:
            idx = words.index(term)
            if idx > 0:  # avoid wrapping around to the last word
                name_lst.append(words[idx - 1] + ' ' + term)

    # Variants with generic qualifier words removed.
    for filler in ('disease', 'disorder', '-related', 'syndrome'):
        if filler in name:
            name_lst.append(name.replace(filler, '').strip())

    # Coarse carcinoma fallbacks, most specific first.
    if 'carcinoma' in name:
        if 'lung' in name:
            name_lst.append('lung carcinoma')
        elif 'cell' in name:
            name_lst.append('cell carcinoma')
        else:
            name_lst.append('carcinoma')

    ## approximation 1: organ / keyword substrings (matches very few names)
    for keyword in ('liver', 'kidney', 'cardio', 'renal', 'hiv'):
        if keyword in name:
            name_lst.append(keyword)

    # approximation 2: individual long words (>= 8 chars), longest first.
    # sorted() is stable, so equally long words keep their original order.
    name_lst.extend(
        word
        for word in sorted(name.split(), key=len, reverse=True)
        if len(word) >= 8
    )

    return name_lst








def get_icd_from_nih(name):
    """Look up ICD-10-CM codes for a disease name via the NLM Clinical Tables API.

    The name is expanded with ``normalize_disease`` and each candidate term is
    queried in turn; the codes from the first query that yields any are
    returned.

    Args:
        name: Free-text disease name.

    Returns:
        A list of ICD-10-CM code strings from the first matching query, or
        ``None`` if no candidate term produced a result.

    Raises:
        urllib.error.URLError: If a request fails (including HTTP errors and
            timeouts).
    """
    # Function-scope imports: the module has no HTTP client imported at file
    # level, so rely on the standard library here.
    import json
    from urllib.parse import quote
    from urllib.request import urlopen

    prefix = 'https://clinicaltables.nlm.nih.gov/api/icd10cm/v3/search?sf=code,name&terms='

    candidates = normalize_disease(name)
    # Tolerate a bare-string return by treating it as a single candidate
    # instead of iterating over its characters.
    if isinstance(candidates, str):
        candidates = [candidates]

    for term in candidates:
        if not term:
            continue  # an empty search term cannot match anything useful
        # Percent-encode the term: candidates contain spaces and may contain
        # other characters that are not valid in a query string.
        url = prefix + quote(term, safe='')
        with urlopen(url, timeout=30) as response:
            payload = json.loads(response.read().decode('utf-8'))
        # An empty result is '[0,[],null,[]]'; the second element of the
        # response array is the list of matching codes.
        if not isinstance(payload, list) or len(payload) < 2:
            continue
        codes = payload[1]
        if codes:
            return list(codes)
    return None

def walkData(root_node, prefix, result_list):
    """Append ``[path, text]`` for every node of a tree to ``result_list``.

    Nodes are visited in pre-order (a node first, then its children in
    iteration order).  ``path`` is ``prefix`` followed by the '/'-joined tags
    from ``root_node`` down to the node; ``text`` is the node's ``.text``.

    Args:
        root_node: Node exposing ``.tag``, ``.text`` and iteration over its
            children.
        prefix: Path string prepended to every generated path.
        result_list: List that is extended in place.

    Returns:
        None.
    """
    # Explicit stack instead of recursion; children are pushed in reverse so
    # that they are popped (and therefore recorded) in their original order.
    pending = [(root_node, prefix)]
    while pending:
        node, parent_path = pending.pop()
        node_path = parent_path + '/' + node.tag
        result_list.append([node_path, node.text])
        pending.extend((child, node_path) for child in reversed(list(node)))



def root2outcome(root):
    """Derive a binary outcome label from the first p-value in an element tree.

    The tree is walked in pre-order and the text of the first element whose
    tag contains ``'p_value'`` is interpreted as follows:

    * starts with ``'<'``  -> 1
    * starts with ``'>'``  -> 0
    * otherwise (an optional leading ``'='`` is dropped) the text is parsed
      as a float: 1 if it is below 0.05, else 0.

    Args:
        root: Root node exposing ``.tag``, ``.text`` and iteration over its
            children.

    Returns:
        1, 0, or ``None`` when there is no such element, its text is missing
        or blank, or the text cannot be parsed as a number.
    """
    # Find the first matching element directly rather than materialising
    # every node path.  With an empty path prefix, "path contains 'p_value'"
    # first becomes true at the first element whose own tag contains it.
    outcome = None
    stack = [root]
    while stack:
        node = stack.pop()
        if 'p_value' in node.tag:
            outcome = node.text
            break
        # Reverse so children are visited in their original order.
        stack.extend(reversed(list(node)))

    if outcome is None:
        return None  # no p_value element, or the element has no text

    # Surrounding whitespace would otherwise defeat the prefix checks below.
    outcome = outcome.strip()
    if not outcome:
        return None

    first_char = outcome[0]
    if first_char == '<':
        return 1
    if first_char == '>':
        return 0
    if first_char == '=':
        outcome = outcome[1:]

    try:
        p_value = float(outcome)
    except ValueError:
        return None
    return 1 if p_value < 0.05 else 0