# data_process.py
import pandas as pd
from difflib import SequenceMatcher

def init_data():
    ''' Load the DDI descriptions and rearrange them for further processing. '''

    df = pd.read_csv('./NER-Medical-Document/data/ddi_description.csv')
    name1 = df['first drug name']
    name2 = df['second drug name']
    sentence = df['description']
    data = []
    for i in range(len(name1)):
        data.append([sentence.iloc[i], [(name1.iloc[i], 'Chemical'), (name2.iloc[i], 'Chemical')]])
    df = pd.DataFrame(data, columns=['text', 'annotation'])
    return df
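
# Illustrative example of one row of the returned frame (hypothetical values):
#   ['Aspirin may increase warfarin levels.', [('Aspirin', 'Chemical'), ('warfarin', 'Chemical')]]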

def matcher(string, pattern):
    ''' Return the start and end index of the pattern in the text (if it occurs),
        together with the text with that occurrence masked out. '''

    match_list = []
    pattern = pattern.strip()
    seqMatch = SequenceMatcher(None, string, pattern, autojunk=False)
    match = seqMatch.find_longest_match(0, len(string), 0, len(pattern))
    # Only accept an exact, full-length match of the pattern.
    if match.size == len(pattern):
        start = match.a
        end = match.a + match.size
        match_tup = (start, end)
        # Mask the matched occurrence so a later call does not find it again.
        string = string.replace(pattern, "X" * len(pattern), 1)
        match_list.append(match_tup)

    return match_list, string
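
# Illustrative example (hypothetical input): matcher('Aspirin may increase warfarin levels', 'warfarin')
# returns ([(21, 29)], 'Aspirin may increase XXXXXXXX levels') -- the matched span plus the masked text.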

def mark_sentence(s, match_list):
    ''' Mark all the entities in the sentence according to the BIO scheme. '''

    # Default every token to 'O'; note that the dict is keyed by the word string,
    # so repeated occurrences of the same word share a single tag.
    word_dict = {}
    for word in s.split():
        word_dict[word] = 'O'

    for start, end, e_type in match_list:
        temp_str = s[start:end]
        tmp_list = temp_str.split()
        if len(tmp_list) > 1:
            # Multi-word entity: the first token gets B-, the remaining tokens get I-.
            word_dict[tmp_list[0]] = 'B-' + e_type
            for w in tmp_list[1:]:
                word_dict[w] = 'I-' + e_type
        else:
            word_dict[temp_str] = 'B-' + e_type
    return word_dict
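
# Illustrative example (hypothetical input):
#   mark_sentence('take Aspirin with codeine phosphate', [(5, 12, 'Chemical'), (18, 35, 'Chemical')])
# returns {'take': 'O', 'Aspirin': 'B-Chemical', 'with': 'O', 'codeine': 'B-Chemical', 'phosphate': 'I-Chemical'}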

def clean(text, clean_punctuation=False, remove_end_point=True):
    ''' Helper function that adds a space before punctuation for better tokenization
        and optionally strips the final character (the sentence-ending period). '''

    filters = ["!", "#", "$", "%", "&", ".", ":", ";", "<", "=", ">", "?", "@",
               "\\", "_", "`", "{", "}", "~", "'"]

    # Cleaning punctuation can cause problems with this data, so it is off by default.
    if clean_punctuation:
        for i in text:
            if i in filters:
                text = text.replace(i, " " + i)

    if remove_end_point:
        text = text[:-1]
    return text
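
# Illustrative example (hypothetical input): with the default arguments,
#   clean('Aspirin may increase warfarin levels.')
# returns 'Aspirin may increase warfarin levels' (the trailing period is dropped).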

def create_data(df, filepath):
    ''' Write the data to filepath in the appropriate format:
        one "token TAG" pair per line, with a blank line between sentences. '''

    with open(filepath, 'w') as f:
        for text, annotation in zip(df.text, df.annotation):
            text = clean(text)
            text_ = text
            match_list = []
            for i in annotation:
                # Match against the progressively masked copy so that repeated
                # mentions of the same drug map to distinct spans.
                a, text_ = matcher(text_, i[0])
                match_list.append((a[0][0], a[0][1], i[1]))

            d = mark_sentence(text, match_list)

            for i in d.keys():
                f.write(i + ' ' + d[i] + '\n')
            f.write('\n')
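
# Illustrative output snippet (hypothetical sentence):
#   Aspirin B-Chemical
#   may O
#   increase O
#   warfarin B-Chemical
#   levels O
#   (a blank line then separates this sentence from the next one)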

def main():
    ''' Main function: combines the previous functions to create the train, dev and test sets. '''

    data = init_data()
    data = data.sample(frac=1).reset_index(drop=True)  # shuffle the data

    ## paths of the txt files to create.
    filepath_train = './NER-Medical-Document/data/train.txt'
    filepath_test = './NER-Medical-Document/data/test.txt'
    filepath_dev = './NER-Medical-Document/data/dev.txt'

    ## creating the files with an 80/10/10 train/test/dev split.
    length = len(data)
    data_train, data_test, data_dev = data[:int(length*0.8)], data[int(length*0.8):int(length*0.9)], data[int(length*0.9):]
    create_data(data_train, filepath_train)
    create_data(data_dev, filepath_dev)
    create_data(data_test, filepath_test)


if __name__ == '__main__':
    main()