# patient_dosage_pipeline.py
1
from xml.dom.minidom import CharacterData
2
from allennlp.predictors.predictor import Predictor
3
from tqdm import tqdm
4
from flair.models import SequenceTagger
5
from flair.data import Sentence
6
from flair.tokenization import SciSpacyTokenizer
7
from quantulum3 import parser  # need to install module stemming
8
import allennlp_models.tagging
9
import os
10
import nltk
11
import re
12
import benepar
13
import spacy
14
import spacy_universal_sentence_encoder
15
16
17
benepar.download('benepar_en3')
18
19
nlp = spacy.load('en_core_web_md')       # python3 -m spacy download en_core_web_md
20
nlp.add_pipe('benepar', config={'model': 'benepar_en3'})
21
22
nlp_2 = spacy_universal_sentence_encoder.load_model('en_use_lg')
23
24
def get_similarity(word1):
    """Score how "patient-characteristic-like" a phrase is.

    Embeds *word1* with the Universal Sentence Encoder pipeline and returns
    its highest similarity against a fixed set of reference phrases that
    describe patient characteristics. The result is floored at 0 (the
    accumulator starts there), so it is always non-negative.
    """
    reference_phrases = ['people with a disease', 'size', 'height', 'age of the patient', 'patient condition', 'old patient', 'pregnant people'
    ]
    query_doc = nlp_2(word1)
    scores = [query_doc.similarity(nlp_2(phrase)) for phrase in reference_phrases]
    return max([0] + scores)
35
36
37
class OpenIE:
    """Thin wrapper around the pretrained AllenNLP Open Information
    Extraction model."""

    # Pretrained Open IE model hosted by AllenNLP.
    MODEL_URL = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"

    def __init__(self):
        self.predictor = Predictor.from_path(self.MODEL_URL)

    def predict(self, sentence):
        """Return the raw Open IE prediction dict for *sentence*."""
        return self.predictor.predict(sentence=sentence)
45
46
47
class Dosage():
    """Extracts dosage-like quantity mentions from text using quantulum3.

    A flair-based NER tagger was tried previously and abandoned in favor
    of quantulum3's unit parser, so no model is loaded at construction.
    """

    def __init__(self):
        pass

    def predict(self, sentence_):
        """Return dosage mentions found in *sentence_*.

        Parses quantities with quantulum3 and keeps those whose base unit
        is a mass/volume dosage unit.

        Parameters
        ----------
        sentence_ : str
            Sentence to scan for dosage mentions.

        Returns
        -------
        list[str]
            Surface strings of matching quantities, e.g. ['1000 mg'].
        """
        results = []
        for q in parser.parse(sentence_):
            dims = q.unit.dimensions
            # Guard: dimensionless quantities (plain numbers) have an empty
            # dimensions list and would raise IndexError on dims[0].
            if dims and dims[0]['base'] in ['milligram', 'milliliter', 'gram']:
                results.append(q.surface)
        return results
70
71
72
73
74
75
def intersection(li1, s2):
    """Return True if any word in *li1* occurs as a substring of *s2*."""
    return any(word in s2 for word in li1)
80
81
82
83
def oie(sentence, model_dosage, model_oie, patient=False, dosage=False):
    ''' 2 pipelines for patient characteristics extraction and dosage extraction.

    Runs Open IE over `sentence`, then depending on the flags:
      * patient=True -> returns a patient-characteristics phrase (str) or None
        (may also fall through and return None implicitly when the similarity
        threshold is not met).
      * dosage=True  -> returns a (drug, dosage) tuple, (None, dosage) when no
        drug argument accompanies the dosage, or (None, None) when nothing
        matches.
    `model_dosage` is a Dosage instance; `model_oie` is an OpenIE instance.
    '''
    pred = model_oie.predict(sentence)
    if patient:
        # --- Strategy 1: constituency parse. Find a "patient"-like noun and
        # inspect its parent constituent for modifiers beyond a bare NP.
        doc = nlp(sentence)
        sent = list(doc.sents)[0]
        found = False
        matching_patient = ['patient', 'patients', 'individual', 'individuals', 'people', 'man', 'woman', 'men', 'women', 'male', 'female']
        spans = list(sent._.constituents)
        for x in spans:
            # NOTE(review): all_parents is reset on every iteration, so it only
            # ever holds the parent of the *current* span; accumulating across
            # spans may have been intended — confirm.
            all_parents = []
            if x.text.strip().lower() in matching_patient:
                print('OK ', x)
                parent = x._.parent
                all_parents.append(parent)
                # found_but_no_characteristics = True
            if len(all_parents) > 0:
                print('PARENTS', all_parents)
                # Shortest parent text = tightest enclosing constituent.
                parent = min(all_parents, key=lambda x: len(x.text))
                print(list(parent._.children))
                for y in parent._.children:
                    # First token of the parse string, minus '(' -> POS/phrase label.
                    y_label = y._.parse_string.split(' ')[0][1:]
                    if y_label not in ['DT', 'NN', 'NNS', 'IN']: # means you have more than just 'the patient(s)'
                        found = True
                        characteristics = parent.text
                        break

        if found:
            print('SIMILARITY 1: ', get_similarity(characteristics))
            if get_similarity(characteristics) > 0.35:
                return characteristics
            else:
                return None

        # --- Strategy 2: fall back to Open IE argument spans when the
        # constituency pass found nothing.
        found = False
        matching_patient = ['patient', 'patients', 'individual', 'individuals', 'people', 'man', 'woman', 'men', 'women', 'male', 'female']
        matching_characteristics = ['young', 'male', 'female', 'elderly', 'years', 'old', 'with']
        for x in pred['verbs']:
            desc = x['description']
            print(desc)
            # NOTE(review): pattern should be a raw string (r'\[ARG.*?\]') to
            # avoid the invalid-escape deprecation warning.
            args = re.findall('\[ARG.*?\]', desc)
            for arg in args:
                if arg.startswith('[ARG1'):
                    for t in matching_patient:
                        if t in arg:
                            # characteristics = arg
                            # found = True
                            # break
                            if intersection(matching_characteristics, arg):
                                # Strip the '[ARG1: ' prefix and trailing ']'.
                                return ' '.join(arg.split(' ')[1:])[:-1]
                            else:
                                # Characteristics may live in a sibling ARG0/ARG2 span.
                                for arg_preced in args:
                                    if arg_preced.startswith('[ARG0') or arg_preced.startswith('[ARG2'):
                                        if intersection(matching_characteristics, arg_preced):
                                            characteristics = arg_preced
                                            found= True
                                            break
        if found:
            print(characteristics)
            # Strip the '[ARGx: ' prefix and trailing ']'.
            characteristics = ' '.join(characteristics.split(' ')[1:])[:-1]
            print('SIMILARITY 2: ', get_similarity(characteristics))
            if get_similarity(characteristics) > 0.35:
                return characteristics
        else:
            return None


    if dosage:
        found = False
        # Quantity surface strings detected by the Dosage model.
        matching_text = model_dosage.predict(sentence)
        for x in pred['verbs']:
            desc = x['description']
            # NOTE(review): same non-raw regex pattern as above.
            args = re.findall('\[ARG.*?\]', desc)
            for arg in args:
                found_drug = False
                # Dosages typically surface as manner/extent/temporal modifiers.
                if arg.startswith('[ARGM-MNR') or arg.startswith('[ARGM-EXT') or arg.startswith('[ARGM-TMP'):
                    for t in matching_text:
                        # Space-insensitive containment check (e.g. '500mg' vs '500 mg').
                        if t.replace(' ', '') in arg.replace(' ', ''):
                            found = True
                            # Pair the dosage with the first core argument (the drug).
                            for arg_preced in args:
                                if arg_preced.startswith('[ARG0') or arg_preced.startswith('[ARG1') or arg_preced.startswith('[ARG2'):
                                    found_drug = True
                                    drug = arg_preced
                                    break
                            couple = (' '.join(drug.split(' ')[1:])[:-1], t) if found_drug else (None, t)
        return couple if found else (None, None)
170
171
172
173
        
174
175
def pipeline(text, window_size = 2):
    """Run coreference resolution + Open IE extraction over *text*.

    The text is split into sentences; each sentence is coref-resolved within
    a sliding window of `window_size` sentences, then passed through the
    patient-characteristics and dosage extraction pipelines.

    Parameters
    ----------
    text : str
        Input text; paragraphs separated by blank lines ('\\n\\n').
    window_size : int
        Number of sentences in the sliding coreference window.

    Returns
    -------
    tuple
        (results_patient, results_dosage, all_transformed_sentences,
        all_sentences), one entry per input sentence.
    """
    model_dosage = Dosage()
    model_oie = OpenIE()

    results_patient = []
    results_dosage = []

    tokenizer_split_sentences = nltk.data.load('tokenizers/punkt/english.pickle')
    model_url = 'https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2021.03.10.tar.gz'
    predictor = Predictor.from_path(model_url)

    # Sentence-split each paragraph, re-appending the paragraph separator to
    # the last sentence so paragraph boundaries survive the join/split below.
    paragraphs = text.split('\n\n')
    sentences = []
    for p in paragraphs:
        sentences.extend(tokenizer_split_sentences.tokenize(p))
        sentences[-1] = sentences[-1] + '\n\n'

    # Seed the sliding window with the first `window_size` sentences and
    # coref-resolve them once, so early sentences also get a transform.
    list_coref = [sentences[i] for i in range(window_size)]
    try:
        transformed_chunk = predictor.coref_resolved(' '.join(list_coref))
        paragraphs_transformed = transformed_chunk.split('\n\n')
        sentences_transformed_init = []
        for p in paragraphs_transformed:
            sentences_transformed_init.extend(tokenizer_split_sentences.tokenize(p))
        print('SENTENCES_TRANSFORMED_INIT', sentences_transformed_init)

    # Was a bare `except:` (also caught SystemExit/KeyboardInterrupt);
    # narrowed — coref can fail on odd input, fall back to raw sentences.
    except Exception:
        sentences_transformed_init = list_coref

    all_transformed_sentences = []
    all_sentences = []
    for s in tqdm(range(len(sentences))):

        if s >= window_size:
            # Slide the window forward and resolve the current chunk.
            sentence = sentences[s]
            list_coref.append(sentence)
            list_coref.pop(0)
            try:
                transformed_chunk = predictor.coref_resolved(' '.join(list_coref))
                paragraphs_transformed = transformed_chunk.split('\n\n')
                for p in paragraphs_transformed:
                    if p != '':
                        sentences_transformed = tokenizer_split_sentences.tokenize(p)
                # Last sentence of the last non-empty paragraph = current one.
                sentence_transformed = sentences_transformed[-1]
            except Exception:  # narrowed from bare `except:`
                sentence_transformed = sentence

        else:
            # Inside the initial window: reuse the pre-computed transforms.
            try:
                sentence_transformed = sentences_transformed_init[s]
            except IndexError:  # narrowed from bare `except:`
                print(s)
                print(sentences_transformed_init)
                print(list_coref)
                print(sentences)

                sentence_transformed = sentences_transformed_init[0]       # TO BE CHANGED, JUST TO AVOID ERROR

        all_transformed_sentences.append(sentence_transformed)
        all_sentences.append(sentences[s])

        # Open IE — patient information
        results_patient.append(oie(sentence_transformed, model_dosage, model_oie,  patient=True))

        # Open IE — dosage information
        results_dosage.append(oie(sentence_transformed, model_dosage, model_oie, dosage=True))

    return results_patient, results_dosage, all_transformed_sentences, all_sentences
251
252
253
254
255
# propagate transformed sentences maybe? (but would also propagate the errors...)
256
257
258
if __name__ == '__main__':
    # Sample inputs covering positive cases (dosages, patient groups) and
    # distractors (e.g. "individual electron additions" in text7).
    text = "Aspirin is most of the time an innocuous drug. There are 2 cats in the kitchen. But it can have deleterious effect if it is administered with a dosage of more than 1000 mg. Some adverse effects are exacerbated for some patients. For instance, if they are more than 70 years old, effects will be more pronounced."
    text2 = "Some studies have proven the impact of cytoxin on young female patients. Indeed, it can cause nose bleeding when 500mg or more if taken by the patient. This a test sentence. "
    text3 = "In this paper we try to prove how chloroquine can generate negative effects on elderly people. It can in particular causes stomach pain when the maximum 1000mg dosage is not respected. Finally we show that it has no or little effect on younger patients even when they take more than 1000mg."
    text4 = "Patients with hemoglobin E beta-thalassemia, a severe form of the disease, were found to have impaired hepcidin function and higher TfR1 levels as a result of an increased erythropoietic drive stemming from the continuously failing erythropoiesis that is caused by improper hemoglobin production [56]."
    text5 = "Another study observed a 3-year disease free survival rate of 80 percent, and an overall survival rate of 82 percent in cervical cancer patients."
    text6 = "A study reported that individuals with hereditary hemochromatosis exhibit an increased risk for developing cancer, particularly in the liver and primarily hepatocellular carcinoma as opposed to biliary tract related cancers."
    text7 = "Due to oxygen's atomic nature, its reduction must proceed in a stepwise fashion of individual electron additions and reactive intermediates."
    text8 = "Chloroquine analogues have also been found to have metabolic, antithrombotic, antineoplastic, and antiviral effects, and have been hypothesized as targeted agents against coronavirus infection since the 2003 SARS outbreak [25,26]."
    results_patient, result_dosage, all_transformed_sentences, all_sentences = pipeline(text + '\n' + text2 + '\n' + text3 + '\n' + text6 + '\n' + text7 + '\n' + text8, window_size=3)

    # Report per-sentence results side by side.
    for original, transformed, patient_info, dosage_info in zip(
            all_sentences, all_transformed_sentences, results_patient, result_dosage):
        print('\n \n')
        print('Original sentence: ', original.strip())
        print('\n')
        print('Transformed sentence: ', transformed.strip())
        print('\n')
        print('Detected patients: ', patient_info)
        print('\n')
        print('Detected dosages: ', dosage_info)
        print('\n')
        print('=============================================')

    # print(get_similarity('for elderly people'))
    # print(get_similarity('patients more than 70 years old'))
    # print(get_similarity('individual electrons'))
285
286