NER-Medical-Documents / Git / [9063a2] /patient_dosage

Models:
philipB/
NER-Medical-Documents
Downloads: 1
[9063a2]: / patient_dosage_pipeline.py
History
Download this file
287 lines (222 with data), 12.1 kB

from xml.dom.minidom import CharacterData
from allennlp.predictors.predictor import Predictor
from tqdm import tqdm
from flair.models import SequenceTagger
from flair.data import Sentence
from flair.tokenization import SciSpacyTokenizer
from quantulum3 import parser  # need to install module stemming
import allennlp_models.tagging
import os
import nltk
import re
import benepar
import spacy
import spacy_universal_sentence_encoder


# Fetch the benepar model at import time so the 'benepar' pipe below can load it.
benepar.download('benepar_en3')

# spaCy pipeline with the benepar component attached; oie() relies on the
# `_.constituents`, `_.parent` and `_.children` span extensions it provides.
nlp = spacy.load('en_core_web_md')       # python3 -m spacy download en_core_web_md
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})

# Sentence-encoder model used by get_similarity() to compare phrases.
nlp_2 = spacy_universal_sentence_encoder.load_model('en_use_lg')

def get_similarity(word1):
    """Return the best similarity between ``word1`` and the reference phrases.

    ``word1`` is encoded with the module-level ``nlp_2`` model and compared
    against a fixed list of patient-related phrases. The highest similarity
    is returned; the result is never lower than 0.
    """
    candidate = nlp_2(word1)
    reference_phrases = [
        'people with a disease',
        'size',
        'height',
        'age of the patient',
        'patient condition',
        'old patient',
        'pregnant people',
    ]
    scores = [candidate.similarity(nlp_2(phrase)) for phrase in reference_phrases]
    # The leading 0 acts as the floor for the returned score.
    return max([0] + scores)


class OpenIE:
    """Thin wrapper around the AllenNLP Open Information Extraction predictor."""

    def __init__(self):
        """Load the pretrained Open IE archive into ``self.predictor``."""
        archive_url = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
        self.predictor = Predictor.from_path(archive_url)

    def predict(self, sentence):
        """Run the predictor on ``sentence`` and return its raw output."""
        return self.predictor.predict(sentence=sentence)


class Dosage():
    """Find dosage-like quantities (mass / volume amounts) in a sentence.

    Quantities are detected with the quantulum3 ``parser``. An earlier
    approach based on a flair sequence tagger filtering ``QUANTITY`` entities
    was abandoned in favour of this one.
    """

    # Base units (first dimension of the parsed unit) accepted as a dosage.
    _DOSAGE_BASE_UNITS = ('milligram', 'milliliter', 'gram')

    def __init__(self):
        # Nothing to load: the quantulum3 parser is used directly in predict().
        pass

    def predict(self, sentence_):
        """Return the surface text of every dosage-like quantity.

        Args:
            sentence_: raw sentence text to scan.

        Returns:
            A list with the ``surface`` string of each parsed quantity whose
            first unit dimension has one of the accepted base units, in the
            order the parser yields them.
        """
        results = []
        for q in parser.parse(sentence_):
            dimensions = q.unit.dimensions
            print(dimensions)
            # Plain numbers ("2 cats") carry no unit dimensions; indexing
            # them would raise IndexError, so they are skipped.
            if not dimensions:
                continue
            if dimensions[0]['base'] in self._DOSAGE_BASE_UNITS:
                results.append(q.surface)
        return results





def intersection(li1, s2):
    """Tell whether at least one item of ``li1`` is contained in ``s2``.

    Containment is evaluated with the ``in`` operator, so when ``s2`` is a
    string this is a substring test.
    """
    return any(candidate in s2 for candidate in li1)



def _strip_role_tag(arg):
    """Drop the leading ``[ROLE:`` token and the closing ``]`` of an Open IE argument."""
    return ' '.join(arg.split(' ')[1:])[:-1]


def oie(sentence, model_dosage, model_oie, patient=False, dosage=False):
    """Extract patient characteristics or a (drug, dosage) pair from a sentence.

    Two pipelines share this entry point; callers enable one with a flag.

    Args:
        sentence: sentence text to analyse.
        model_dosage: object whose ``predict(sentence)`` returns the dosage
            strings found in the sentence.
        model_oie: object whose ``predict(sentence)`` returns a dict with a
            ``'verbs'`` list; each entry has a bracketed ``'description'``.
        patient: run the patient-characteristics pipeline.
        dosage: run the dosage pipeline.

    Returns:
        * ``patient=True``: the characteristics text, or ``None``.
        * ``dosage=True``: a ``(drug, dosage)`` tuple; ``drug`` is ``None``
          when no ARG0/ARG1/ARG2 argument exists, and the tuple is
          ``(None, None)`` when no dosage matched.
        * neither flag: ``None``.
    """
    # Raw string: '\[' in a normal literal is an invalid escape sequence.
    arg_pattern = r'\[ARG.*?\]'
    pred = model_oie.predict(sentence)

    if patient:
        matching_patient = ['patient', 'patients', 'individual', 'individuals', 'people', 'man', 'woman', 'men', 'women', 'male', 'female']

        # Stage 1: constituency parse. Look for a patient word whose parent
        # constituent contains more than determiner / noun / preposition.
        found = False
        doc = nlp(sentence)
        doc_sentences = list(doc.sents)
        # An empty sentence yields no spans instead of an IndexError.
        spans = list(doc_sentences[0]._.constituents) if doc_sentences else []
        for x in spans:
            if x.text.strip().lower() not in matching_patient:
                continue
            print('OK ', x)
            parent = x._.parent
            if parent is None:
                # Root constituent: there is no enclosing phrase to inspect.
                continue
            print('PARENTS', [parent])
            print(list(parent._.children))
            for y in parent._.children:
                y_label = y._.parse_string.split(' ')[0][1:]
                if y_label not in ['DT', 'NN', 'NNS', 'IN']:  # means you have more than just 'the patient(s)'
                    found = True
                    characteristics = parent.text
                    break

        if found:
            # Score once; the encoder call is expensive.
            score = get_similarity(characteristics)
            print('SIMILARITY 1: ', score)
            return characteristics if score > 0.35 else None

        # Stage 2: fall back on the Open IE arguments.
        found = False
        matching_characteristics = ['young', 'male', 'female', 'elderly', 'years', 'old', 'with']
        for x in pred['verbs']:
            desc = x['description']
            print(desc)
            args = re.findall(arg_pattern, desc)
            for arg in args:
                if not arg.startswith('[ARG1'):
                    continue
                for t in matching_patient:
                    if t not in arg:
                        continue
                    if intersection(matching_characteristics, arg):
                        # The patient argument itself carries the characteristics.
                        return _strip_role_tag(arg)
                    # Otherwise look for them in a sibling ARG0 / ARG2.
                    for arg_preced in args:
                        if arg_preced.startswith(('[ARG0', '[ARG2')):
                            if intersection(matching_characteristics, arg_preced):
                                characteristics = arg_preced
                                found = True
                                break
        if found:
            print(characteristics)
            characteristics = _strip_role_tag(characteristics)
            score = get_similarity(characteristics)
            print('SIMILARITY 2: ', score)
            if score > 0.35:
                return characteristics
            # NOTE(review): a rejected candidate falls through to the dosage
            # stage below (relevant only when both flags are set); kept as is.
        else:
            return None

    if dosage:
        found = False
        matching_text = model_dosage.predict(sentence)
        for x in pred['verbs']:
            args = re.findall(arg_pattern, x['description'])
            for arg in args:
                if not arg.startswith(('[ARGM-MNR', '[ARGM-EXT', '[ARGM-TMP')):
                    continue
                compact_arg = arg.replace(' ', '')
                for t in matching_text:
                    # Compare without spaces so "1000 mg" matches "1000mg".
                    if t.replace(' ', '') not in compact_arg:
                        continue
                    found = True
                    # The first core argument of the same verb is taken as the drug.
                    drug = next(
                        (a for a in args if a.startswith(('[ARG0', '[ARG1', '[ARG2'))),
                        None,
                    )
                    couple = (_strip_role_tag(drug), t) if drug is not None else (None, t)
        # The last matching pair wins when several are found.
        return couple if found else (None, None)



        

def pipeline(text, window_size = 2):
    """Run patient and dosage extraction on every sentence of ``text``.

    The text is split into paragraphs (on blank lines) and sentences. Each
    sentence is coreference-resolved together with the sentences before it
    in a sliding window, then passed to ``oie`` twice: once for patient
    characteristics and once for dosage.

    Args:
        text: document text; paragraphs are separated by ``'\\n\\n'``.
        window_size: number of sentences in the coreference window.

    Returns:
        A 4-tuple of equally long lists, one entry per sentence:
        ``(results_patient, results_dosage, all_transformed_sentences,
        all_sentences)``. All four are empty when ``text`` has no sentence.
    """
    tokenizer_split_sentences = nltk.data.load('tokenizers/punkt/english.pickle')

    sentences = []
    for p in text.split('\n\n'):
        paragraph_sentences = tokenizer_split_sentences.tokenize(p)
        if not paragraph_sentences:
            # Empty paragraph: nothing to tag, and no earlier sentence to touch.
            continue
        # Mark the paragraph end so it survives the join / split round trip.
        paragraph_sentences[-1] += '\n\n'
        sentences.extend(paragraph_sentences)

    if not sentences:
        # Nothing to process; skip loading the heavy models.
        return [], [], [], []

    model_dosage = Dosage()
    model_oie = OpenIE()

    results_patient = []
    results_dosage = []

    model_url = 'https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz'
    predictor = Predictor.from_path(model_url)

    # First window: slicing tolerates texts shorter than the window.
    list_coref = sentences[:max(window_size, 0)]
    try:
        transformed_chunk = predictor.coref_resolved(' '.join(list_coref))
        sentences_transformed_init = []
        for p in transformed_chunk.split('\n\n'):
            sentences_transformed_init.extend(tokenizer_split_sentences.tokenize(p))
        print('SENTENCES_TRANSFORMED_INIT', sentences_transformed_init)
    except Exception:
        # Best effort: keep the unresolved sentences when coreference fails.
        sentences_transformed_init = list(list_coref)

    all_transformed_sentences = []
    all_sentences = []
    for s in tqdm(range(len(sentences))):
        sentence = sentences[s]

        if s >= window_size:
            # Window made of this sentence and the ones right before it.
            list_coref = sentences[s - window_size + 1:s + 1]
            try:
                transformed_chunk = predictor.coref_resolved(' '.join(list_coref))
                non_empty = [p for p in transformed_chunk.split('\n\n') if p != '']
                resolved = tokenizer_split_sentences.tokenize(non_empty[-1]) if non_empty else []
                # The current sentence is the last one of the resolved chunk.
                sentence_transformed = resolved[-1] if resolved else sentence
            except Exception:
                sentence_transformed = sentence
        elif s < len(sentences_transformed_init):
            sentence_transformed = sentences_transformed_init[s]
        else:
            # Re-tokenising the resolved chunk gave fewer sentences than the
            # window held; fall back to this sentence's own original text.
            sentence_transformed = sentence

        all_transformed_sentences.append(sentence_transformed)
        all_sentences.append(sentence)

        # Open IE: patient information, then dosage information.
        results_patient.append(oie(sentence_transformed, model_dosage, model_oie, patient=True))
        results_dosage.append(oie(sentence_transformed, model_dosage, model_oie, dosage=True))

    return results_patient, results_dosage, all_transformed_sentences, all_sentences




# propagate transformed sentences maybe? (but would also propagate the errors...)


if __name__ == '__main__':
    # Sample passages used to exercise the pipeline by hand.
    text = "Aspirin is most of the time an innocuous drug. There are 2 cats in the kitchen. But it can have deleterious effect if it is administered with a dosage of more than 1000 mg. Some adverse effects are exacerbated for some patients. For instance, if they are more than 70 years old, effects will be more pronounced."
    text2 = "Some studies have proven the impact of cytoxin on young female patients. Indeed, it can cause nose bleeding when 500mg or more if taken by the patient. This a test sentence. "
    text3 = "In this paper we try to prove how chloroquine can generate negative effects on elderly people. It can in particular causes stomach pain when the maximum 1000mg dosage is not respected. Finally we show that it has no or little effect on younger patients even when they take more than 1000mg."
    text4 = "Patients with hemoglobin E beta-thalassemia, a severe form of the disease, were found to have impaired hepcidin function and higher TfR1 levels as a result of an increased erythropoietic drive stemming from the continuously failing erythropoiesis that is caused by improper hemoglobin production [56]."
    text5 = "Another study observed a 3-year disease free survival rate of 80 percent, and an overall survival rate of 82 percent in cervical cancer patients."
    text6 = "A study reported that individuals with hereditary hemochromatosis exhibit an increased risk for developing cancer, particularly in the liver and primarily hepatocellular carcinoma as opposed to biliary tract related cancers."
    text7 = "Due to oxygen's atomic nature, its reduction must proceed in a stepwise fashion of individual electron additions and reactive intermediates."
    text8 = "Chloroquine analogues have also been found to have metabolic, antithrombotic, antineoplastic, and antiviral effects, and have been hypothesized as targeted agents against coronavirus infection since the 2003 SARS outbreak [25,26]."

    # text4 and text5 are kept as extra samples but are not part of this run.
    document = '\n'.join([text, text2, text3, text6, text7, text8])
    results_patient, result_dosage, all_transformed_sentences, all_sentences = pipeline(document, window_size=3)

    # One report section per sentence.
    report = zip(all_sentences, all_transformed_sentences, results_patient, result_dosage)
    for original, transformed, patients, dosages in report:
        print('\n \n')
        print('Original sentence: ', original.strip())
        print('\n')
        print('Transformed sentence: ', transformed.strip())
        print('\n')
        print('Detected patients: ', patients)
        print('\n')
        print('Detected dosages: ', dosages)
        print('\n')
        print('=============================================')

        
    # print(get_similarity('for elderly people'))
    # print(get_similarity('patients more than 70 years old'))
    # print(get_similarity('individual electrons'))