a b/rule_based_ner/dict_ner.py
1
from __future__ import annotations
2
3
from typing import List
4
from ehr import HealthRecord
5
from annotations import Entity
6
from collections import defaultdict
7
import re
8
9
10
class DictNER:
    '''
    A dictionary based NER model.

    ``fit`` collects the lower-cased annotation text of every training
    entity and turns it into one regular expression per entity type.
    ``predict`` runs those expressions (case-insensitively) over the text
    of each record and wraps every hit in an ``Entity``.

    Attributes
    ----------
    ner_re : dict
        Maps entity type name to the regular expression (str) used to
        find that type. Empty until ``fit`` has been called.
    '''

    # Zero-width word delimiters: a dictionary entry only matches when it
    # is preceded/followed by a new line, space, tab or '|' (or by the
    # start/end of the text). '|' is accepted because the previous
    # pattern's character class contained it as a literal. Being
    # lookarounds, the delimiters are not part of the match, so match
    # spans are exactly the entity spans and a single delimiter can be
    # shared by two neighbouring entities.
    _LEFT_DELIM = r'(?<![^\n \t|])'
    _RIGHT_DELIM = r'(?![^\n \t|])'

    # Strength is just a number followed by mg or mcg.
    _STRENGTH_RE = r'\d+[ ]*(?:mg|mcg)'

    # Pattern that can never match; used for an empty dictionary.
    _NEVER_MATCH_RE = r'(?!)'

    def __init__(self):
        self.ner_re: dict = {}

    def _get_clean_re(self, entity_list: List[str]) -> str:
        '''
        Generates a regular expression from a list of entities

        Every entry is escaped so that it is matched literally, and the
        alternation is wrapped in zero-width delimiter checks so that an
        entry only matches as a separate word. The input list is not
        modified.

        Parameters
        ----------
        entity_list : List[str]
            List of entity text.

        Returns
        -------
        entity_re : str
            Regular expression. For an empty list this is a pattern
            that never matches.

        '''
        if not entity_list:
            return self._NEVER_MATCH_RE

        # re.escape covers every regex metacharacter ('.', '|', '\\', ...),
        # not only a hand-picked subset.
        alternatives = '|'.join(re.escape(text) for text in entity_list)

        return self._LEFT_DELIM + '(?:' + alternatives + ')' + self._RIGHT_DELIM

    def _find_spans(self, text: str) -> List[tuple]:
        '''
        Finds the character ranges of all dictionary hits in a text

        Parameters
        ----------
        text : str
            Text to search.

        Returns
        -------
        List[tuple]
            ``(start, end, entity_name)`` tuples, grouped by entity type
            in the order of ``self.ner_re`` and, within a type, in order
            of appearance. ``end`` is exclusive.

        '''
        spans = []
        for ent_name, ent_re in self.ner_re.items():
            # No patterns include padding characters, so the match span
            # is used as is (this also holds for the Strength pattern).
            for match in re.finditer(ent_re, text, re.IGNORECASE):
                spans.append((match.start(), match.end(), ent_name))

        return spans

    def fit(self, train_data: List[HealthRecord]) -> DictNER:
        '''
        Generates a dictionary for the model

        Parameters
        ----------
        train_data : List[HealthRecord]
            Records to generate the dictionary from.

        Returns
        -------
        DictNER
            Self object.

        '''
        # Inner dicts are used as insertion-ordered sets: O(1)
        # de-duplication while keeping first-seen order.
        vocabulary = defaultdict(dict)

        for data in train_data:
            for ent in data.entities.values():
                # We have a specific RE for Strength
                if ent.name == 'Strength':
                    continue

                # Ignore text with length 1. The type is only registered
                # once it has a usable entry, so no pattern is ever
                # built from an empty list.
                if len(ent.ann_text) > 1:
                    vocabulary[ent.name].setdefault(ent.ann_text.lower())

        ner_re = {name: self._get_clean_re(list(texts))
                  for name, texts in vocabulary.items()}
        ner_re['Strength'] = self._STRENGTH_RE

        self.ner_re = ner_re
        return self

    def predict(self, test_data: List[HealthRecord])\
            -> List[List[Entity]]:
        '''
        Returns character ranges for all predicted entities

        Parameters
        ----------
        test_data : List[HealthRecord]
            Text to predict the entities.

        Returns
        -------
        List[List[Entity]]
            Predictions for each example. Each prediction list
            contains several Entity objects, with ids "T1", "T2", ...
            restarting for every example.

        '''
        predictions = []
        for data in test_data:
            entities = []
            spans = self._find_spans(data.text)

            # Convert to Entity Objects
            for j, (start, end, ent_name) in enumerate(spans, start=1):
                ent = Entity(entity_id="T" + str(j))
                ent.set_range([start, end])
                ent.set_entity_type(ent_name)
                entities.append(ent)

            predictions.append(entities)

        return predictions