|
a |
|
b/data_process.py |
|
|
1 |
import pandas as pd |
|
|
2 |
from difflib import SequenceMatcher |
|
|
3 |
|
|
|
4 |
|
|
|
5 |
|
|
|
6 |
|
|
|
7 |
def init_data(csv_path='./NER-Medical-Document/data/ddi_description.csv'):
    '''Load the drug-interaction CSV and reshape it for annotation.

    Args:
        csv_path: Path of the CSV file to read. The file must provide the
            columns 'first drug name', 'second drug name' and
            'description'. Defaults to the path this function previously
            had hard-coded, so existing callers are unaffected.

    Returns:
        A DataFrame with the columns 'text' (the description) and
        'annotation' (a list of two ``(name, 'Chemical')`` tuples: first
        drug name, then second drug name).
    '''
    df = pd.read_csv(csv_path)

    # Walk the three columns in lockstep instead of indexing row by row.
    rows = [
        [description, [(first, 'Chemical'), (second, 'Chemical')]]
        for first, second, description in zip(
            df['first drug name'],
            df['second drug name'],
            df['description'],
        )
    ]
    return pd.DataFrame(rows, columns=['text', 'annotation'])
|
|
19 |
|
|
|
20 |
def matcher(string, pattern):
    '''Locate the first exact occurrence of a pattern and mask it out.

    The pattern is stripped of surrounding whitespace before searching.
    Only the earliest occurrence is reported; later occurrences are left
    untouched in the returned string.

    Args:
        string: Text to search in.
        pattern: Text to look for.

    Returns:
        A tuple ``(match_list, masked)``. ``match_list`` is
        ``[(start, end)]`` (end exclusive) when the pattern is found and
        ``[]`` otherwise. ``masked`` is ``string`` with the matched span
        replaced by the same number of 'X' characters, or ``string``
        unchanged when there is no match.
    '''
    pattern = pattern.strip()

    # An exact substring search yields the same earliest span that the
    # longest-match search produced, without the quadratic comparison cost.
    start = string.find(pattern)
    if start == -1:
        return [], string

    end = start + len(pattern)
    masked = string[:start] + 'X' * len(pattern) + string[end:]
    return [(start, end)], masked
|
|
35 |
|
|
|
36 |
def mark_sentence(s, match_list):
    '''Tag the words of a sentence following the BIO scheme.

    Every whitespace-separated word of ``s`` starts as 'O'. For each span
    in ``match_list`` the first word inside the span becomes
    'B-<type>' and any following words inside it become 'I-<type>'.

    Args:
        s: The sentence to tag.
        match_list: Iterable of ``(start, end, e_type)`` tuples, where
            ``s[start:end]`` is the entity text and ``e_type`` its label.

    Returns:
        A dict mapping each word to its tag, in first-seen order. Because
        the mapping is keyed by the word itself, repeated words share a
        single entry, and a span that covers only part of a word adds that
        fragment as an extra key.
    '''
    word_dict = dict.fromkeys(s.split(), 'O')

    for start, end, e_type in match_list:
        tokens = s[start:end].split()
        if not tokens:
            # An empty or whitespace-only span holds no word to tag; keying
            # on the raw slice would add an entry that is not a token.
            continue
        word_dict[tokens[0]] = 'B-' + e_type
        for token in tokens[1:]:
            word_dict[token] = 'I-' + e_type

    return word_dict
|
|
53 |
|
|
|
54 |
def clean(text, clean_punctuation=False, remove_end_point=True):
    '''Prepare a sentence for whitespace tokenization.

    Args:
        text: The sentence to clean.
        clean_punctuation: When true, insert one space before every
            punctuation character listed in ``filters`` so that it is
            split off as its own token.
        remove_end_point: When true, drop a trailing '.' from the text.
            Text that does not end with '.' is left intact.

    Returns:
        The cleaned text. A string is returned for every combination of
        flags.
    '''
    filters = frozenset([
        "!", "#", "$", "%", "&", ".", ":", ";", "<", "=", ">", "?", "@",
        "\\", "_", "`", "{", "}", "~", "'",
    ])

    # cleaning punctuation can cause problems with my data
    if clean_punctuation:
        # Single pass: each punctuation character gets exactly one space,
        # however many times that character occurs in the text.
        text = ''.join(' ' + ch if ch in filters else ch for ch in text)

    if remove_end_point and text.endswith('.'):
        text = text[:-1]

    return text
|
|
68 |
|
|
|
69 |
def create_data(df, filepath):
    '''Write the annotated sentences to a file, one "word tag" per line.

    Each sentence is cleaned, its entities are located, the words are
    tagged with mark_sentence, and the resulting pairs are written
    followed by a blank line that separates sentences.

    Args:
        df: DataFrame with a 'text' column and an 'annotation' column
            whose items are sequences of ``(entity text, entity type)``.
        filepath: Destination file; it is overwritten. Written as UTF-8.
    '''
    with open(filepath, 'w', encoding='utf-8') as f:
        for text, annotation in zip(df.text, df.annotation):
            text = clean(text)

            # Search in a progressively masked copy so that an entity
            # already located cannot be matched a second time.
            masked = text
            match_list = []
            for item in annotation:
                spans, masked = matcher(masked, item[0])
                if not spans:
                    # Entity text does not occur in the sentence; skip it
                    # rather than failing on an empty match list.
                    continue
                start, end = spans[0]
                match_list.append((start, end, item[1]))

            tags = mark_sentence(text, match_list)

            f.writelines(word + ' ' + tag + '\n' for word, tag in tags.items())
            f.write('\n')
|
|
86 |
|
|
|
87 |
def main():
    '''Build the train, dev and test files from the source data.

    The data is loaded with init_data, shuffled, split 80% / 10% / 10%
    into train / test / dev in that row order, and each part is written
    with create_data.
    '''
    data = init_data()
    # sample() returns a new frame, so the result must be kept for the
    # shuffle to take effect.
    data = data.sample(frac=1).reset_index(drop=True)

    ## path to save the txt file.
    filepath_train = './NER-Medical-Document/data/train.txt'
    filepath_test = './NER-Medical-Document/data/test.txt'
    filepath_dev = './NER-Medical-Document/data/dev.txt'

    ## creating the file.
    length = len(data)
    train_end = int(length * 0.8)
    test_end = int(length * 0.9)

    data_train = data[:train_end]
    data_test = data[train_end:test_end]
    data_dev = data[test_end:]

    create_data(data_train, filepath_train)
    create_data(data_dev, filepath_dev)
    create_data(data_test, filepath_test)
|
|
104 |
|
|
|
105 |
|
|
|
106 |
|
|
|
107 |
|
|
|
108 |
# Run the pipeline only when this file is executed as a script.
if __name__ == '__main__':
    main()