a b/src/Parser/biomedner_server.py
1
import os
2
import json
3
import socket
4
import struct
5
import argparse
6
7
from datetime import datetime
8
from biomedner_init import BioMedNER
9
from ops import filter_entities, pubtator2dict_list
10
11
def count_entities(data):
    """Return the total number of entities across all documents.

    Args:
        data: Iterable of documents. A document without an 'entities' key
            is skipped; otherwise ``d['entities']`` must be a mapping whose
            values are sized collections of entities.

    Returns:
        int: Sum of the lengths of every per-type entity collection.
    """
    # Only the per-type collections are needed, so walk values() instead of
    # unpacking (and discarding) the entity-type key from items().
    return sum(
        len(entities)
        for d in data
        if 'entities' in d
        for entities in d['entities'].values()
    )
20
21
def biomedner_recognize(model, dict_path, base_name, args):
    """Run NER on one PubTator input file and write the first result as JSON.

    The input is read from ``<biomedner_home>/input/<dict_path[2:]>.PubTator``
    and the output is written to
    ``<biomedner_home>/output/<dict_path[2:]>.json``. Before writing, the
    recognition result is passed through ``filter_entities`` and the total
    entity count is stored under the 'num_entities' key of the first result
    document. Only that first document (``res[0]``) is written.

    Args:
        model: Object exposing ``recognize(input_dl=..., base_name=...)``.
        dict_path: Request path; its first two characters are dropped when
            building the input/output file names.
        base_name: Label forwarded to the model and used in log lines.
        args: Namespace providing ``biomedner_home`` and ``time_format``.

    Returns:
        ``(None, 0)`` when the model produced no result (``None`` or an
        empty sequence). On success nothing is returned explicitly (i.e.
        ``None``); the outcome is the JSON file on disk.
    """
    # NOTE(review): the [2:] slice presumably strips a two-character prefix
    # such as './' added by the client -- confirm against the caller.
    request_stem = dict_path[2:]
    input_mt_ner = os.path.join(args.biomedner_home, 'input',
                                f'{request_stem}.PubTator')
    output_mt_ner = os.path.join(args.biomedner_home, 'output',
                                 f'{request_stem}.json')

    dict_list = pubtator2dict_list(input_mt_ner)

    res = model.recognize(
        input_dl=dict_list,
        base_name=base_name
    )

    # Treat an empty result like a missing one: res[0] below would otherwise
    # raise IndexError and take the serving loop down with it.
    if res is None or len(res) == 0:
        return None, 0

    # filter_entities yields one item per document; the second element of
    # each item is reported below as the number of filtered species.
    num_filtered_species_per_doc = filter_entities(res)
    for n_f_spcs in num_filtered_species_per_doc:
        if n_f_spcs[1] > 0:
            print(datetime.now().strftime(args.time_format),
                  f'[{base_name}] Filtered {n_f_spcs[1]} species')

    res[0]['num_entities'] = count_entities(res)

    # Write the first result document to the output file as JSON.
    with open(output_mt_ner, 'w', encoding='utf-8') as f:
        json.dump(res[0], f)
49
50
def run_server(model, args):
    """Serve NER requests over TCP, one connection at a time, forever.

    For every accepted connection the server reads up to 512 bytes, decodes
    them as a UTF-8 request path, runs ``biomedner_recognize`` for that path
    and replies with the same path, prefixed by its encoded length as a
    2-byte big-endian unsigned integer. The connection is then closed.

    Args:
        model: NER model handed through to ``biomedner_recognize``.
        args: Namespace providing ``biomedner_host`` and ``biomedner_port``
            (plus whatever ``biomedner_recognize`` reads from it).
    """
    host = args.biomedner_host
    port = args.biomedner_port
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind((host, port))
        s.listen(1)
        while True:
            conn, _ = s.accept()
            # Context manager guarantees the client socket is closed even if
            # decoding or recognition raises.
            with conn:
                # NOTE(review): this path comes straight off the network and
                # is joined into file paths by biomedner_recognize without
                # validation -- consider rejecting '..' / absolute paths.
                dict_path = conn.recv(512).decode('utf-8')
                base_name = dict_path.split('.')[0]
                # hotfix: strip the two-character sequence NUL + 'A' from
                # the base name.
                base_name = base_name.replace("\x00A", "")

                biomedner_recognize(model, dict_path, base_name, args)

                # The length prefix must count encoded bytes, not characters,
                # or it disagrees with the payload for non-ASCII paths.
                payload = dict_path.encode('utf-8')
                # sendall keeps writing until the whole reply is out; send()
                # may stop after a partial write.
                conn.sendall(struct.pack('>H', len(payload)) + payload)
70
71
72
if __name__ == "__main__":
    # Script entry point: parse the command-line options, build the NER
    # model from them, and start the blocking socket server.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--seed',
        type=int,
        default=1,
        help='random seed for initialization',
    )
    parser.add_argument(
        '--model_name_or_path',
        default='dmis-lab/bern2-ner',
    )
    parser.add_argument(
        '--max_seq_length',
        type=int,
        default=128,
        help='The maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.',
    )
    parser.add_argument(
        '--biomedner_home',
        help='biomedical language model home',
    )
    parser.add_argument(
        '--biomedner_host',
        default='localhost',
        help='biomedical language model host',
    )
    parser.add_argument(
        '--biomedner_port',
        type=int,
        default=18894,
        help='biomedical language model port',
    )
    parser.add_argument(
        '--time_format',
        default='[%d/%b/%Y %H:%M:%S.%f]',
        help='time format',
    )
    parser.add_argument(
        '--no_cuda',
        action="store_true",
        help="Avoid using CUDA when available",
    )
    args = parser.parse_args()

    ner_model = BioMedNER(args)
    run_server(ner_model, args)