
src/Parser/biomedner_engine.py
import random
import requests
import os
import string
import numpy as np
import hashlib
import time
import shutil
import asyncio
import socket
import struct
import json
import sys
from datetime import datetime
from collections import OrderedDict
import traceback
import bioregistry

HERE = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, HERE)

class RunBioMedNER():
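    """Thin client for a multi-task biomedical NER service.

    The input text is written as a PubTator-format file under
    `biomedner_home/input`, the NER server listening on
    `biomedner_host:biomedner_port` is notified via a small length-prefixed
    socket message, and the tagged JSON result is read back from
    `biomedner_home/output`.
    """
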
    def __init__(self,
        biomedner_home,
        biomedner_port,
        biomedner_host='localhost',
        time_format='[%d/%b/%Y %H:%M:%S.%f]',
        max_word_len=50,
        seed=2019,
        keep_files=False,
        no_cuda=False):

        self.time_format = time_format

        print(datetime.now().strftime(self.time_format), 'BioMedNER LOADING..')
        random.seed(seed)
        np.random.seed(seed)

        if not os.path.exists('./output'):
            os.mkdir('output')

        # delete prev. version outputs
        if not keep_files:
            delete_files('./output')
            delete_files(os.path.join('./multi_ner', 'input'))
            delete_files(os.path.join('./multi_ner', 'tmp'))
            delete_files(os.path.join('./multi_ner', 'output'))

        self.biomedner_home = biomedner_home
        self.biomedner_host = biomedner_host
        self.biomedner_port = biomedner_port

        self.max_word_len = max_word_len

        print(datetime.now().strftime(self.time_format), 'BioMedNER LOADED..')

    def annotate_text(self, text, pmid=None):
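        """Annotate a raw text string.

        Returns the tagged-document dict produced by `tag_entities`, with
        'error_code' 0 and an empty 'error_message' on success, or a minimal
        error dict with 'error_code' 1 if anything goes wrong.
        """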
        try:
            text = text.strip()
            base_name = self.generate_base_name(text) # for the name of temporary files
            text = self.preprocess_input(text, base_name)
            output = self.tag_entities(text, base_name)
            output['error_code'], output['error_message'] = 0, ""
        except Exception as e:
            errStr = traceback.format_exc()
            print(errStr)

            output = {"error_code": 1, "error_message": "Something went wrong. Try again."}

        return output

    def preprocess_input(self, text, base_name):
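        """Normalize the input before tagging.

        Replaces line breaks, tabs, and other whitespace control characters
        with spaces, drops non-ASCII characters, and truncates words longer
        than `max_word_len`.
        """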
        if '\r\n' in text:
            print(datetime.now().strftime(self.time_format),
                  f'[{base_name}] Found a CRLF -> replace it w/ a space')
            text = text.replace('\r\n', ' ')

        if '\n' in text:
            print(datetime.now().strftime(self.time_format),
                  f'[{base_name}] Found a line break -> replace it w/ a space')
            text = text.replace('\n', ' ')

        if '\t' in text:
            print(datetime.now().strftime(self.time_format),
                  f'[{base_name}] Found a tab -> replace it w/ a space')
            text = text.replace('\t', ' ')

        if '\xa0' in text:
            print(datetime.now().strftime(self.time_format),
                  f'[{base_name}] Found a \\xa0 -> replace it w/ a space')
            text = text.replace('\xa0', ' ')

        if '\x0b' in text:
            print(datetime.now().strftime(self.time_format),
                  f'[{base_name}] Found a \\x0b -> replace it w/ a space')
            text = text.replace('\x0b', ' ')

        if '\x0c' in text:
            print(datetime.now().strftime(self.time_format),
                  f'[{base_name}] Found a \\x0c -> replace it w/ a space')
            text = text.replace('\x0c', ' ')

        # remove non-ascii characters
        text = text.encode("ascii", "ignore").decode()

        found_too_long_words = 0
        tokens = text.split(' ')
        for idx, tk in enumerate(tokens):
            if len(tk) > self.max_word_len:
                tokens[idx] = tk[:self.max_word_len]
                found_too_long_words += 1
        if found_too_long_words > 0:
            print(datetime.now().strftime(self.time_format),
                  f'[{base_name}] Found a word longer than {self.max_word_len} chars -> truncated it')
            text = ' '.join(tokens)

        return text

    def tag_entities(self, text, base_name):
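        """Write `text` to a PubTator file, trigger the NER server, and return
        the single tagged document, annotated with timing information."""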
        n_ascii_letters = 0
        for l in text:
            if l not in string.ascii_letters:
                continue
            n_ascii_letters += 1

        if n_ascii_letters == 0:
            text = 'No ascii letters. Please enter your text in English.'

        base_name = self.generate_base_name(text)
        print(datetime.now().strftime(self.time_format),
              f'id: {base_name}')

        pubtator_file = f'{base_name}.PubTator'

        input_biomedner = os.path.join(self.biomedner_home, 'input',
                                       f'{pubtator_file}.PubTator')
        output_biomedner = os.path.join(self.biomedner_home, 'output',
                                        f'{pubtator_file}.json')

        if not os.path.exists(self.biomedner_home + '/input'):
            os.mkdir(self.biomedner_home + '/input')
        if not os.path.exists(self.biomedner_home + '/output'):
            os.mkdir(self.biomedner_home + '/output')

        # Write input str to a .PubTator format file
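        # PubTator layout as used here: "<id>|t|<title>" and "<id>|a|<abstract>";
        # the title is left empty, so the whole input goes in the abstract field.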
        with open(input_biomedner, 'w', encoding='utf-8') as f:
            # only abstract
            f.write(f'{base_name}|t|\n')
            f.write(f'{base_name}|a|{text}\n\n')

        ner_start_time = time.time()

        arguments_for_coroutines = []
        loop = asyncio.new_event_loop()
        for ner_type in ['biomedner']:
            arguments_for_coroutines.append([ner_type, pubtator_file, output_biomedner, base_name, loop])
        async_result = loop.run_until_complete(self.async_ner(arguments_for_coroutines))
        loop.close()
        biomedner_elapse_time = async_result['biomedner_elapse_time']

        # get output result to merge
        tagged_docs = async_result['tagged_docs']
        num_entities = async_result['num_entities']

        ner_elapse_time = time.time() - ner_start_time
        print(datetime.now().strftime(self.time_format),
              f'[{base_name}] ALL NER {ner_elapse_time} sec')

        # time record
        tagged_docs[0]['elapse_time'] = {
            'biomedner_elapse_time': biomedner_elapse_time,
            'ner_elapse_time': ner_elapse_time,
        }

        # Delete temp files
        os.remove(input_biomedner)
        os.remove(output_biomedner)

        return tagged_docs[0]

    # generate id for temporary files
    def generate_base_name(self, text):
        # add time.time() to avoid collision
        base_name = hashlib.sha224((text + str(time.time())).encode('utf-8')).hexdigest()
        return base_name

    async def async_ner(self, arguments):
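        """Run the per-NER-type coroutines concurrently and merge their result dicts."""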
        coroutines = [self._ner_wrap(*arg) for arg in arguments]
        result = await asyncio.gather(*coroutines)
        result = {k: v for e in result for k, v in e.items()} # merge
        return result

    async def _ner_wrap(self, ner_type, pubtator_file, output_biomedner, base_name, loop):
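        """Run one NER pass: tell the server which file to process, then read its JSON output.

        The socket reply is used only as a completion signal; the actual result
        is read from `output_biomedner` once the server has written it.
        """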
        if ner_type == 'biomedner':
            # Run neural model
            start_time = time.time()
            biomedner_resp = await async_tell_inputfile(self.biomedner_host,
                                         self.biomedner_port,
                                         pubtator_file,
                                         loop)

            with open(output_biomedner, 'r', encoding='utf-8') as f:
                tagged_docs = [json.load(f)]

            if tagged_docs is None:
                return None

            assert len(tagged_docs) == 1
            num_entities = tagged_docs[0]['num_entities']
            biomedner_elapse_time = time.time() - start_time
            print(datetime.now().strftime(self.time_format),
                f'[{base_name}] Multi-task NER {biomedner_elapse_time} sec, #entities: {num_entities}')

            return {"biomedner_elapse_time": biomedner_elapse_time,
                    "tagged_docs": tagged_docs,
                    "num_entities": num_entities}

async def async_tell_inputfile(host, port, inputfile, loop):
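    """Send `inputfile` (a file name) to the NER server and return its reply.

    Wire format: a 2-byte big-endian length prefix followed by the UTF-8 name;
    the reply uses the same framing, so its first two bytes are stripped.
    """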
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
    try:
        sock.connect((host, port))
        input_str = inputfile
        input_stream = struct.pack('>H', len(input_str)) + input_str.encode(
            'utf-8')
        sock.send(input_stream)
        # output_stream = sock.recv(512)
        output_stream = await loop.run_in_executor(None, sock.recv, 512) # for async
        resp = output_stream.decode('utf-8')[2:]

        sock.close()
        return resp
    except ConnectionRefusedError as e:
        print(e)
        return None
    except TimeoutError as e:
        print(e)
        return None
    except ConnectionResetError as e:
        print(e)
        return None

def sync_tell_inputfile(host, port, inputfile):
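    """Synchronous counterpart of `async_tell_inputfile`; same wire format."""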
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1)
    try:
        sock.connect((host, port))
        input_str = inputfile
        input_stream = struct.pack('>H', len(input_str)) + input_str.encode(
            'utf-8')
        sock.send(input_stream)
        output_stream = sock.recv(512) # for sync
        resp = output_stream.decode('utf-8')[2:]

        sock.close()
        return resp
    except ConnectionRefusedError as e:
        print(e)
        return None
    except TimeoutError as e:
        print(e)
        return None
    except ConnectionResetError as e:
        print(e)
        return None

def delete_files(dirname):
    if not os.path.exists(dirname):
        return

    n_deleted = 0
    for f in os.listdir(dirname):
        f_path = os.path.join(dirname, f)
        if not os.path.isfile(f_path):
            continue
        # print('Delete', f_path)
        os.remove(f_path)
        n_deleted += 1
    print(f'Deleted {n_deleted} file(s) in {dirname}')

if __name__ == '__main__':
    import argparse

    argparser = argparse.ArgumentParser()
    argparser.add_argument('--max_word_len', type=int, help='word max chars',
                           default=50)
    argparser.add_argument('--seed', type=int, help='seed value', default=2019)

    argparser.add_argument('--biomedner_home',
                           help='biomedical language model home',
                           default=os.path.join(os.path.expanduser('~'),
                                                'biomedner', 'biomednerHome'))
    argparser.add_argument('--biomedner_host',
                           help='biomedical language model host', default='localhost')
    argparser.add_argument('--biomedner_port', type=int,
                           help='biomedical language model port', default=18894)
    argparser.add_argument('--time_format',
                           help='time format', default='[%d/%b/%Y %H:%M:%S.%f]')
    argparser.add_argument("--keep_files", action="store_true")
    argparser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available")
    args = argparser.parse_args()
    biomedner = RunBioMedNER(
        max_word_len=args.max_word_len,
        seed=args.seed,
        biomedner_home=args.biomedner_home,
        biomedner_host=args.biomedner_host,
        biomedner_port=args.biomedner_port,
        time_format=args.time_format,
        keep_files=args.keep_files,
        no_cuda=args.no_cuda,
    )
    text = "KRAS is a proto-oncogene involved in various cancers."
    result = biomedner.annotate_text(text.lower())
    print(result)
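    # On success (NER server reachable), `result` is the tagged-document dict
    # read from the server's JSON output, extended with 'elapse_time',
    # 'error_code' == 0 and 'error_message' == ''; on failure it contains only
    # 'error_code' == 1 and an 'error_message'.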