[9063a2]: / data_process.py

Download this file

110 lines (82 with data), 3.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
import pandas as pd
from difflib import SequenceMatcher
def init_data(path='./NER-Medical-Document/data/ddi_description.csv'):
    ''' Load the drug-interaction CSV and rearrange it for further manipulations.

    Args:
        path: CSV file to read. It must contain the columns
            'first drug name', 'second drug name' and 'description'.
            Defaults to the project's ddi_description.csv.

    Returns:
        A DataFrame with two columns:
        'text'       -- the description sentence of each row,
        'annotation' -- a list of two (drug name, 'Chemical') tuples,
                        first drug followed by second drug.
    '''
    df = pd.read_csv(path)
    # Walk the three columns in lockstep instead of indexing row by row.
    rows = zip(df['description'], df['first drug name'], df['second drug name'])
    data = [
        [sentence, [(first, 'Chemical'), (second, 'Chemical')]]
        for sentence, first, second in rows
    ]
    return pd.DataFrame(data, columns=['text', 'annotation'])
def matcher(string, pattern):
    ''' Return the start and end index of the pattern if it is present in the text.

    The pattern is stripped of surrounding whitespace and then searched for
    as an exact substring of the text.

    Args:
        string: text to search in.
        pattern: entity text to look for.

    Returns:
        A tuple (match_list, string):
        match_list -- [(start, end)] for the first occurrence (end exclusive),
                      or [] when the pattern does not occur in the text;
        string     -- the text with that first occurrence replaced by 'X'
                      characters of the same length, or the unchanged text
                      when nothing matched.
    '''
    pattern = pattern.strip()
    # An exact-substring search is all that is needed here; str.find returns
    # the earliest occurrence, or -1 when the pattern is absent.
    start = string.find(pattern)
    if start == -1:
        return [], string
    end = start + len(pattern)
    # Mask with same-length filler so character offsets stay valid.
    masked = string.replace(pattern, "X" * len(pattern), 1)
    return [(start, end)], masked
def mark_sentence(s, match_list):
    ''' Tag the words of a sentence following the BIO scheme.

    Args:
        s: the sentence text.
        match_list: iterable of (start, end, entity_type) character spans
            into s.

    Returns:
        A dict mapping each whitespace-separated word of s to 'O',
        'B-<type>' (first word of an entity) or 'I-<type>' (following
        words of an entity).

    Note: tags are keyed by word text, so a word that occurs several times
    in the sentence has a single entry and a single tag.
    '''
    # Every word starts out as "outside any entity".
    word_dict = dict.fromkeys(s.split(), 'O')
    for start, end, e_type in match_list:
        span = s[start:end]
        tokens = span.split()
        if len(tokens) <= 1:
            # Single-word span: the span text itself is the key.
            word_dict[span] = 'B-' + e_type
            continue
        head, *tail = tokens
        word_dict[head] = 'B-' + e_type
        word_dict.update((token, 'I-' + e_type) for token in tail)
    return word_dict
def clean(text, clean_punctuation=False, remove_end_point=True):
    ''' Helper function that prepares a sentence for tokenization.

    Args:
        text: the sentence to clean.
        clean_punctuation: when True, insert one space before every
            punctuation character listed in `filters`.
        remove_end_point: when True, drop a trailing '.' from the text.

    Returns:
        The cleaned text. A string is always returned, whatever the flags.
    '''
    filters = ["!", "#", "$", "%", "&", ".", ":", ";", "<", "=", ">", "?", "@",
               "\\", "_", "`", "{", "}", "~", "'"]
    # cleaning punctuation can cause problems with my data
    if clean_punctuation:
        # A single translate pass adds exactly one space per punctuation
        # character, even when the same character occurs several times.
        text = text.translate(str.maketrans({ch: " " + ch for ch in filters}))
    # Only strip the final character when it really is a period, so text
    # that does not end with one is left intact.
    if remove_end_point and text.endswith("."):
        text = text[:-1]
    return text
def create_data(df, filepath):
    ''' The function responsible for the creation of data in the appropriate format.

    For every row, the text is cleaned, each annotated entity is located in
    it, and the words are written one per line as '<word> <tag>'. A blank
    line separates consecutive sentences.

    Args:
        df: DataFrame with a 'text' column and an 'annotation' column, where
            each annotation is a sequence of (entity text, entity type).
        filepath: path of the output file; it is overwritten.
    '''
    with open(filepath, 'w', encoding='utf-8') as f:
        for text, annotation in zip(df.text, df.annotation):
            text = clean(text)
            match_list = []
            for entity in annotation:
                # Each entity is searched in the same cleaned text; the
                # masked copy returned by matcher is not needed.
                spans, _ = matcher(text, entity[0])
                if not spans:
                    # Entity text not found in the sentence: leave it
                    # untagged rather than abort the whole export.
                    continue
                start, end = spans[0]
                match_list.append((start, end, entity[1]))
            tags = mark_sentence(text, match_list)
            f.writelines(word + ' ' + tag + '\n' for word, tag in tags.items())
            f.write('\n')
def main():
    ''' Main function, combines the previous functions to create the train, dev and test sets.

    The data is shuffled and then split 80% / 10% / 10% into train, test
    and dev, each written to its own text file.
    '''
    data = init_data()
    # shuffle the data -- sample() returns a new DataFrame, so the result
    # has to be rebound or the shuffle is lost.
    data = data.sample(frac=1).reset_index(drop=True)
    ## path to save the txt file.
    filepath_train = './NER-Medical-Document/data/train.txt'
    filepath_test = './NER-Medical-Document/data/test.txt'
    filepath_dev = './NER-Medical-Document/data/dev.txt'
    ## creating the file.
    length = len(data)
    train_end = int(length * 0.8)
    test_end = int(length * 0.9)
    data_train = data[:train_end]
    data_test = data[train_end:test_end]
    data_dev = data[test_end:]
    create_data(data_train, filepath_train)
    create_data(data_dev, filepath_dev)
    create_data(data_test, filepath_test)
# Build the dataset files only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()