|
a |
|
b/src/Parser/biomedner_engine.py |
|
|
1 |
import random |
|
|
2 |
import requests |
|
|
3 |
import os |
|
|
4 |
import string |
|
|
5 |
import numpy as np |
|
|
6 |
import hashlib |
|
|
7 |
import time |
|
|
8 |
import shutil |
|
|
9 |
import asyncio |
|
|
10 |
import socket |
|
|
11 |
import struct |
|
|
12 |
import json |
|
|
13 |
import sys |
|
|
14 |
from datetime import datetime |
|
|
15 |
from collections import OrderedDict |
|
|
16 |
import traceback |
|
|
17 |
import bioregistry |
|
|
18 |
|
|
|
19 |
HERE = os.path.dirname(os.path.abspath(__file__)) |
|
|
20 |
sys.path.insert(0, HERE) |
|
|
21 |
|
|
|
22 |
class RunBioMedNER(): |
|
|
23 |
def __init__(self, |
|
|
24 |
biomedner_home, |
|
|
25 |
biomedner_port, |
|
|
26 |
biomedner_host='localhost', |
|
|
27 |
time_format='[%d/%b/%Y %H:%M:%S.%f]', |
|
|
28 |
max_word_len=50, |
|
|
29 |
seed=2019, |
|
|
30 |
keep_files=False, |
|
|
31 |
no_cuda=False): |
|
|
32 |
|
|
|
33 |
self.time_format = time_format |
|
|
34 |
|
|
|
35 |
print(datetime.now().strftime(self.time_format), 'BioMedNER LOADING..') |
|
|
36 |
random.seed(seed) |
|
|
37 |
np.random.seed(seed) |
|
|
38 |
|
|
|
39 |
if not os.path.exists('./output'): |
|
|
40 |
os.mkdir('output') |
|
|
41 |
|
|
|
42 |
# delete prev. version outputs |
|
|
43 |
if not keep_files: |
|
|
44 |
delete_files('./output') |
|
|
45 |
delete_files(os.path.join('./multi_ner', 'input')) |
|
|
46 |
delete_files(os.path.join('./multi_ner', 'tmp')) |
|
|
47 |
delete_files(os.path.join('./multi_ner', 'output')) |
|
|
48 |
|
|
|
49 |
self.biomedner_home = biomedner_home |
|
|
50 |
self.biomedner_host = biomedner_host |
|
|
51 |
self.biomedner_port = biomedner_port |
|
|
52 |
|
|
|
53 |
self.max_word_len = max_word_len |
|
|
54 |
|
|
|
55 |
print(datetime.now().strftime(self.time_format), 'BioMedNER LOADED..') |
|
|
56 |
|
|
|
57 |
def annotate_text(self, text, pmid=None): |
|
|
58 |
try: |
|
|
59 |
text = text.strip() |
|
|
60 |
base_name = self.generate_base_name(text) # for the name of temporary files |
|
|
61 |
text = self.preprocess_input(text, base_name) |
|
|
62 |
output = self.tag_entities(text, base_name) |
|
|
63 |
output['error_code'], output['error_message'] = 0, "" |
|
|
64 |
except Exception as e: |
|
|
65 |
errStr = traceback.format_exc() |
|
|
66 |
print(errStr) |
|
|
67 |
|
|
|
68 |
output = {"error_code": 1, "error_message": "Something went wrong. Try again."} |
|
|
69 |
|
|
|
70 |
return output |
|
|
71 |
|
|
|
72 |
|
|
|
73 |
def preprocess_input(self, text, base_name): |
|
|
74 |
if '\r\n' in text: |
|
|
75 |
print(datetime.now().strftime(self.time_format), |
|
|
76 |
f'[{base_name}] Found a CRLF -> replace it w/ a space') |
|
|
77 |
text = text.replace('\r\n', ' ') |
|
|
78 |
|
|
|
79 |
if '\n' in text: |
|
|
80 |
print(datetime.now().strftime(self.time_format), |
|
|
81 |
f'[{base_name}] Found a line break -> replace it w/ a space') |
|
|
82 |
text = text.replace('\n', ' ') |
|
|
83 |
|
|
|
84 |
if '\t' in text: |
|
|
85 |
print(datetime.now().strftime(self.time_format), |
|
|
86 |
f'[{base_name}] Found a tab -> replace w/ a space') |
|
|
87 |
text = text.replace('\t', ' ') |
|
|
88 |
|
|
|
89 |
if '\xa0' in text: |
|
|
90 |
print(datetime.now().strftime(self.time_format), |
|
|
91 |
f'[{base_name}] Found a \\xa0 -> replace w/ a space') |
|
|
92 |
text = text.replace('\xa0', ' ') |
|
|
93 |
|
|
|
94 |
if '\x0b' in text: |
|
|
95 |
print(datetime.now().strftime(self.time_format), |
|
|
96 |
f'[{base_name}] Found a \\x0b -> replace w/ a space') |
|
|
97 |
text = text.replace('\x0b', ' ') |
|
|
98 |
|
|
|
99 |
if '\x0c' in text: |
|
|
100 |
print(datetime.now().strftime(self.time_format), |
|
|
101 |
f'[{base_name}] Found a \\x0c -> replace w/ a space') |
|
|
102 |
text = text.replace('\x0c', ' ') |
|
|
103 |
|
|
|
104 |
# remove non-ascii |
|
|
105 |
text = text.encode("ascii", "ignore").decode() |
|
|
106 |
|
|
|
107 |
found_too_long_words = 0 |
|
|
108 |
tokens = text.split(' ') |
|
|
109 |
for idx, tk in enumerate(tokens): |
|
|
110 |
if len(tk) > self.max_word_len: |
|
|
111 |
tokens[idx] = tk[:self.max_word_len] |
|
|
112 |
found_too_long_words += 1 |
|
|
113 |
if found_too_long_words > 0: |
|
|
114 |
print(datetime.now().strftime(self.time_format), |
|
|
115 |
f'[{base_name}] Found a too long word -> cut the suffix of the word') |
|
|
116 |
text = ' '.join(tokens) |
|
|
117 |
|
|
|
118 |
return text |
|
|
119 |
|
|
|
120 |
def tag_entities(self, text, base_name): |
|
|
121 |
n_ascii_letters = 0 |
|
|
122 |
for l in text: |
|
|
123 |
if l not in string.ascii_letters: |
|
|
124 |
continue |
|
|
125 |
n_ascii_letters += 1 |
|
|
126 |
|
|
|
127 |
if n_ascii_letters == 0: |
|
|
128 |
text = 'No ascii letters. Please enter your text in English.' |
|
|
129 |
|
|
|
130 |
base_name = self.generate_base_name(text) |
|
|
131 |
print(datetime.now().strftime(self.time_format), |
|
|
132 |
f'id: {base_name}') |
|
|
133 |
|
|
|
134 |
pubtator_file = f'{base_name}.PubTator' |
|
|
135 |
|
|
|
136 |
input_biomedner = os.path.join(self.biomedner_home, 'input', |
|
|
137 |
f'{pubtator_file}.PubTator') |
|
|
138 |
output_biomedner = os.path.join(self.biomedner_home, 'output', |
|
|
139 |
f'{pubtator_file}.json') |
|
|
140 |
|
|
|
141 |
if not os.path.exists(self.biomedner_home + '/input'): |
|
|
142 |
os.mkdir(self.biomedner_home + '/input') |
|
|
143 |
if not os.path.exists(self.biomedner_home + '/output'): |
|
|
144 |
os.mkdir(self.biomedner_home + '/output') |
|
|
145 |
|
|
|
146 |
# Write input str to a .PubTator format file |
|
|
147 |
with open(input_biomedner, 'w', encoding='utf-8') as f: |
|
|
148 |
# only abstract |
|
|
149 |
f.write(f'{base_name}|t|\n') |
|
|
150 |
f.write(f'{base_name}|a|{text}\n\n') |
|
|
151 |
|
|
|
152 |
ner_start_time = time.time() |
|
|
153 |
|
|
|
154 |
arguments_for_coroutines = [] |
|
|
155 |
loop = asyncio.new_event_loop() |
|
|
156 |
for ner_type in ['biomedner']: |
|
|
157 |
arguments_for_coroutines.append([ner_type, pubtator_file, output_biomedner, base_name, loop]) |
|
|
158 |
async_result = loop.run_until_complete(self.async_ner(arguments_for_coroutines)) |
|
|
159 |
loop.close() |
|
|
160 |
biomedner_elapse_time = async_result['biomedner_elapse_time'] |
|
|
161 |
|
|
|
162 |
# get output result to merge |
|
|
163 |
tagged_docs = async_result['tagged_docs'] |
|
|
164 |
num_entities = async_result['num_entities'] |
|
|
165 |
|
|
|
166 |
ner_elapse_time = time.time() - ner_start_time |
|
|
167 |
print(datetime.now().strftime(self.time_format), |
|
|
168 |
f'[{base_name}] ALL NER {ner_elapse_time} sec') |
|
|
169 |
|
|
|
170 |
# time record |
|
|
171 |
tagged_docs[0]['elapse_time'] = { |
|
|
172 |
'biomedner_elapse_time':biomedner_elapse_time, |
|
|
173 |
'ner_elapse_time': ner_elapse_time, |
|
|
174 |
} |
|
|
175 |
|
|
|
176 |
# Delete temp files |
|
|
177 |
os.remove(input_biomedner) |
|
|
178 |
os.remove(output_biomedner) |
|
|
179 |
|
|
|
180 |
return tagged_docs[0] |
|
|
181 |
|
|
|
182 |
# generate id for temporary files |
|
|
183 |
def generate_base_name(self, text): |
|
|
184 |
# add time.time() to avoid collision |
|
|
185 |
base_name = hashlib.sha224((text+str(time.time())).encode('utf-8')).hexdigest() |
|
|
186 |
return base_name |
|
|
187 |
|
|
|
188 |
async def async_ner(self, arguments): |
|
|
189 |
coroutines = [self._ner_wrap(*arg) for arg in arguments] |
|
|
190 |
result = await asyncio.gather(*coroutines) |
|
|
191 |
result = {k:v for e in result for k,v in e.items()} # merge |
|
|
192 |
return result |
|
|
193 |
|
|
|
194 |
async def _ner_wrap(self, ner_type, pubtator_file, output_biomedner, base_name, loop): |
|
|
195 |
if ner_type == 'biomedner': |
|
|
196 |
# Run neural model |
|
|
197 |
start_time = time.time() |
|
|
198 |
biomedner_resp = await async_tell_inputfile(self.biomedner_host, |
|
|
199 |
self.biomedner_port, |
|
|
200 |
pubtator_file, |
|
|
201 |
loop) |
|
|
202 |
|
|
|
203 |
with open(output_biomedner, 'r', encoding='utf-8') as f: |
|
|
204 |
tagged_docs = [json.load(f)] |
|
|
205 |
|
|
|
206 |
num_entities = tagged_docs[0]['num_entities'] |
|
|
207 |
if tagged_docs is None: |
|
|
208 |
return None |
|
|
209 |
|
|
|
210 |
assert len(tagged_docs) == 1 |
|
|
211 |
biomedner_elapse_time = time.time() - start_time |
|
|
212 |
print(datetime.now().strftime(self.time_format), |
|
|
213 |
f'[{base_name}] Multi-task NER {biomedner_elapse_time} sec, #entities: {num_entities}') |
|
|
214 |
|
|
|
215 |
return {"biomedner_elapse_time": biomedner_elapse_time, |
|
|
216 |
"tagged_docs": tagged_docs, |
|
|
217 |
"num_entities": num_entities} |
|
|
218 |
|
|
|
219 |
async def async_tell_inputfile(host, port, inputfile, loop): |
|
|
220 |
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) |
|
|
221 |
sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) |
|
|
222 |
try: |
|
|
223 |
sock.connect((host, port)) |
|
|
224 |
input_str = inputfile |
|
|
225 |
input_stream = struct.pack('>H', len(input_str)) + input_str.encode( |
|
|
226 |
'utf-8') |
|
|
227 |
sock.send(input_stream) |
|
|
228 |
# output_stream = sock.recv(512) |
|
|
229 |
output_stream = await loop.run_in_executor(None, sock.recv, 512) # for async |
|
|
230 |
resp = output_stream.decode('utf-8')[2:] |
|
|
231 |
|
|
|
232 |
sock.close() |
|
|
233 |
return resp |
|
|
234 |
except ConnectionRefusedError as e: |
|
|
235 |
print(e) |
|
|
236 |
return None |
|
|
237 |
except TimeoutError as e: |
|
|
238 |
print(e) |
|
|
239 |
return None |
|
|
240 |
except ConnectionResetError as e: |
|
|
241 |
print(e) |
|
|
242 |
return None |
|
|
243 |
|
|
|
244 |
def sync_tell_inputfile(host, port, inputfile): |
|
|
245 |
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) |
|
|
246 |
sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) |
|
|
247 |
try: |
|
|
248 |
sock.connect((host, port)) |
|
|
249 |
input_str = inputfile |
|
|
250 |
input_stream = struct.pack('>H', len(input_str)) + input_str.encode( |
|
|
251 |
'utf-8') |
|
|
252 |
sock.send(input_stream) |
|
|
253 |
output_stream = sock.recv(512) # for sync |
|
|
254 |
# output_stream = await loop.run_in_executor(None, sock.recv, 512) |
|
|
255 |
resp = output_stream.decode('utf-8')[2:] |
|
|
256 |
|
|
|
257 |
sock.close() |
|
|
258 |
return resp |
|
|
259 |
except ConnectionRefusedError as e: |
|
|
260 |
print(e) |
|
|
261 |
return None |
|
|
262 |
except TimeoutError as e: |
|
|
263 |
print(e) |
|
|
264 |
return None |
|
|
265 |
except ConnectionResetError as e: |
|
|
266 |
print(e) |
|
|
267 |
return None |
|
|
268 |
|
|
|
269 |
def delete_files(dirname): |
|
|
270 |
if not os.path.exists(dirname): |
|
|
271 |
return |
|
|
272 |
|
|
|
273 |
n_deleted = 0 |
|
|
274 |
for f in os.listdir(dirname): |
|
|
275 |
f_path = os.path.join(dirname, f) |
|
|
276 |
if not os.path.isfile(f_path): |
|
|
277 |
continue |
|
|
278 |
# print('Delete', f_path) |
|
|
279 |
os.remove(f_path) |
|
|
280 |
n_deleted += 1 |
|
|
281 |
print(dirname, n_deleted) |
|
|
282 |
|
|
|
283 |
if __name__ == '__main__': |
|
|
284 |
import argparse |
|
|
285 |
|
|
|
286 |
argparser = argparse.ArgumentParser() |
|
|
287 |
argparser.add_argument('--max_word_len', type=int, help='word max chars', |
|
|
288 |
default=50) |
|
|
289 |
argparser.add_argument('--seed', type=int, help='seed value', default=2019) |
|
|
290 |
|
|
|
291 |
argparser.add_argument('--biomedner_home', |
|
|
292 |
help='biomedical language model home', |
|
|
293 |
default=os.path.join(os.path.expanduser('~'), |
|
|
294 |
'biomedner', 'biomednerHome')) |
|
|
295 |
argparser.add_argument('--biomedner_host', |
|
|
296 |
help='biomedical language model host', default='localhost') |
|
|
297 |
argparser.add_argument('--biomedner_port', type=int, |
|
|
298 |
help='biomedical language model port', default=18894) |
|
|
299 |
argparser.add_argument('--time_format', |
|
|
300 |
help='time format', default='[%d/%b/%Y %H:%M:%S.%f]') |
|
|
301 |
argparser.add_argument("--keep_files", action="store_true") |
|
|
302 |
argparser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") |
|
|
303 |
args = argparser.parse_args() |
|
|
304 |
biomedner = RunBioMedNER( |
|
|
305 |
max_word_len=args.max_word_len, |
|
|
306 |
seed=args.seed, |
|
|
307 |
biomedner_home=args.biomedner_home, |
|
|
308 |
biomedner_host=args.biomedner_host, |
|
|
309 |
biomedner_port=args.biomedner_port, |
|
|
310 |
time_format=args.time_format, |
|
|
311 |
keep_files=args.keep_files, |
|
|
312 |
no_cuda=args.no_cuda, |
|
|
313 |
) |
|
|
314 |
text = "KRAS is a proto-oncogene involved in various cancers." |
|
|
315 |
result = biomedner.annotate_text(text.lower()) |
|
|
316 |
print(result) |