[f87529]: / src / Parser / biomedner_server.py

Download this file

91 lines (75 with data), 3.4 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
import os
import json
import socket
import struct
import argparse
from datetime import datetime
from biomedner_init import BioMedNER
from ops import filter_entities, pubtator2dict_list
def count_entities(data):
    """Return the total number of entities across all documents in *data*.

    Args:
        data: Iterable of dicts. A dict may carry an ``'entities'`` key that
            maps an entity type to a sized collection of entities; dicts
            without that key contribute nothing.

    Returns:
        The sum of ``len(entities)`` over every entity type of every
        document, or 0 when *data* is empty.
    """
    # Only the per-type collections matter for counting, so iterate the
    # values directly instead of unpacking (type, entities) pairs.
    return sum(
        len(entities)
        for doc in data
        if 'entities' in doc
        for entities in doc['entities'].values()
    )
def biomedner_recognize(model, dict_path, base_name, args):
    """Run NER on one PubTator input file and write the result as JSON.

    Reads ``<biomedner_home>/input/<name>.PubTator``, passes the parsed
    documents to ``model.recognize``, post-processes the result with
    ``filter_entities``, stores the total entity count under the
    ``'num_entities'`` key of the first result and dumps that first result
    to ``<biomedner_home>/output/<name>.json``. ``<name>`` is ``dict_path``
    without its first two characters.

    Args:
        model: Object exposing ``recognize(input_dl=..., base_name=...)``.
        dict_path: Request string naming the input/output file pair.
        base_name: Label forwarded to the model and used in log lines.
        args: Namespace providing ``biomedner_home`` and ``time_format``.

    Returns:
        ``(None, 0)`` when the model returns ``None`` (nothing is written);
        otherwise ``(res[0], num_entities)``. Previously the success path
        fell through and returned ``None``, which disagreed with the
        two-tuple returned on failure.
    """
    # NOTE(review): the first two characters of dict_path are skipped --
    # presumably a length prefix carried over from the socket message;
    # confirm against the client that sends the request.
    file_stem = dict_path[2:]
    input_mt_ner = os.path.join(args.biomedner_home, 'input',
                                f'{file_stem}.PubTator')
    output_mt_ner = os.path.join(args.biomedner_home, 'output',
                                 f'{file_stem}.json')

    dict_list = pubtator2dict_list(input_mt_ner)
    res = model.recognize(
        input_dl=dict_list,
        base_name=base_name
    )
    if res is None:
        return None, 0

    # filter_entities is called before counting, so the count below reflects
    # whatever it leaves in `res`. Index 1 of each returned item is reported
    # as the number of filtered species.
    num_filtered_species_per_doc = filter_entities(res)
    for n_f_spcs in num_filtered_species_per_doc:
        if n_f_spcs[1] > 0:
            print(datetime.now().strftime(args.time_format),
                  '[{}] Filtered {} species'
                  .format(base_name, n_f_spcs[1]))

    # The count covers every document in `res`, but only the first document
    # is annotated with it and written out.
    num_entities = count_entities(res)
    res[0]['num_entities'] = num_entities

    # Write the first document's result as a JSON file.
    with open(output_mt_ner, 'w', encoding='utf-8') as f:
        json.dump(res[0], f)

    return res[0], num_entities
def run_server(model, args):
    """Serve NER requests over TCP, one connection at a time, forever.

    For each accepted connection: read up to 512 bytes as a UTF-8 request
    string, run ``biomedner_recognize`` for it, then echo the request string
    back prefixed with its length as a big-endian unsigned 16-bit integer.
    The connection is closed after every request.

    Args:
        model: NER model handed through to ``biomedner_recognize``.
        args: Namespace providing ``biomedner_host`` and ``biomedner_port``
            (plus whatever ``biomedner_recognize`` reads from it).

    Raises:
        Any exception from socket setup, decoding or recognition propagates
        and stops the server; the active connection is closed first.
    """
    host = args.biomedner_host
    port = args.biomedner_port
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind((host, port))
        s.listen(1)
        while True:
            conn, _addr = s.accept()
            # The context manager closes the connection even when
            # recognition or sending raises, so the descriptor never leaks.
            with conn:
                dict_path = conn.recv(512).decode('utf-8')
                base_name = dict_path.split('.')[0]
                # hotfix
                # NOTE(review): "\x00A" is the byte pair 0x00 0x41 --
                # presumably a leaked length prefix; confirm with the client.
                base_name = base_name.replace("\x00A", "")
                biomedner_recognize(model, dict_path, base_name, args)

                # The length prefix must describe the bytes on the wire, so
                # measure the encoded payload rather than the character
                # count (identical for ASCII requests).
                payload = dict_path.encode('utf-8')
                output_stream = struct.pack('>H', len(payload)) + payload
                # sendall retries until the whole reply is written; send()
                # may stop after a partial write.
                conn.sendall(output_stream)
def _parse_cli_args():
    """Build the command-line parser for the server and parse ``sys.argv``."""
    parser = argparse.ArgumentParser()

    # Model / inference settings.
    parser.add_argument(
        '--seed',
        type=int,
        default=1,
        help='random seed for initialization',
    )
    parser.add_argument('--model_name_or_path', default='dmis-lab/bern2-ner')
    parser.add_argument(
        '--max_seq_length',
        type=int,
        default=128,
        help='The maximum total input sequence length after tokenization. '
             'Sequences longer than this will be truncated, sequences shorter '
             'will be padded.',
    )

    # Filesystem location and network endpoint of the server.
    parser.add_argument(
        '--biomedner_home',
        help='biomedical language model home',
    )
    parser.add_argument(
        '--biomedner_host',
        default='localhost',
        help='biomedical language model host',
    )
    parser.add_argument(
        '--biomedner_port',
        type=int,
        default=18894,
        help='biomedical language model port',
    )

    # Logging and device selection.
    parser.add_argument(
        '--time_format',
        default='[%d/%b/%Y %H:%M:%S.%f]',
        help='time format',
    )
    parser.add_argument(
        '--no_cuda',
        action="store_true",
        help="Avoid using CUDA when available",
    )
    return parser.parse_args()


if __name__ == "__main__":
    # Parse options, build the model from them, then serve requests forever.
    args = _parse_cli_args()
    ner_model = BioMedNER(args)
    run_server(ner_model, args)