[bc9e98]: / HINT / utils.py

Download this file

148 lines (111 with data), 3.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
###### import ######
import pickle
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.info')
RDLogger.DisableLog('rdApp.*')
###### import ######
def plot_hist(prefix_name, prediction, label):
import seaborn as sns
import matplotlib.pyplot as plt
figure_name = prefix_name + "_histogram.png"
positive_prediction = [prediction[i] for i in range(len(label)) if label[i]==1]
negative_prediction = [prediction[i] for i in range(len(label)) if label[i]==0]
save_file_name = "results/" + prefix_name.split('/')[-1] + "_positive_negative.pkl"
pickle.dump((positive_prediction, negative_prediction), open(save_file_name, 'wb'))
sns.distplot(positive_prediction, hist=True, kde=False, bins=20, color = 'blue', label = 'success') #### bins = 50 -> 20
sns.distplot(negative_prediction, hist=True, kde=False, bins=20, color = 'red', label = 'fail')
plt.xlabel("predicted success probability", fontsize=24)
plt.ylabel("frequencies", fontsize = 25)
plt.legend(fontsize = 21)
plt.tight_layout()
# plt.show()
plt.savefig(figure_name)
return
def replace_strange_symbol(text):
for i in "[]'\n/":
text = text.replace(i,'_')
return text
# xml read blog: https://blog.csdn.net/yiluochenwu/article/details/23515923
def walkData(root_node, prefix, result_list):
temp_list =[prefix + '/' + root_node.tag, root_node.text]
result_list.append(temp_list)
children_node = root_node.getchildren()
if len(children_node) == 0:
return
for child in children_node:
walkData(child, prefix = prefix + '/' + root_node.tag, result_list = result_list)
def dynamic_programming(s1, s2):
arr2d = [[0 for i in s2] for j in s1]
if s1[0] == s2[0]:
arr2d[0][0] = 1
for i in range(1, len(s1)):
if s1[i]==s2[0]:
arr2d[i][0] = 1
else:
arr2d[i][0] = arr2d[i-1][0]
for i in range(1,len(s2)):
if s2[i]==s1[0]:
arr2d[0][i] = 1
else:
arr2d[0][i] = arr2d[0][i-1]
for i in range(1,len(s1)):
for j in range(1,len(s2)):
if s1[i] == s2[j]:
arr2d[i][j] = arr2d[i-1][j-1] + 1
else:
arr2d[i][j] = max(arr2d[i-1][j], arr2d[i][j-1])
return arr2d[len(s1)-1][len(s2)-1]
def get_path_of_all_xml_file():
input_file = "./data/all_xml"
with open(input_file, 'r') as fin:
lines = fin.readlines()
input_file_lst = [i.strip() for i in lines]
return input_file_lst
def remove_multiple_space(text):
text = ' '.join(text.split())
return text
def nctid_2_xml_file_path(nctid):
assert len(nctid)==11
prefix = nctid[:7] + "xxxx"
datafolder = os.path.join("./ClinicalTrialGov/", prefix, nctid+".xml")
return datafolder
def fingerprints_from_mol(mol):
fp = AllChem.GetMorganFingerprint(mol, 3, useCounts=True, useFeatures=True)
size = 2048
nfp = np.zeros((1, size), np.int32)
for idx,v in fp.GetNonzeroElements().items():
nidx = idx%size
nfp[0, nidx] += int(v)
return nfp
def smiles2fp(smiles):
try:
mol = Chem.MolFromSmiles(smile)
fp = fingerprints_from_mol(mol)
return fp
except:
return np.zeros((1, 2048), np.int32)
def smiles_lst2fp(smiles_lst):
fp_lst = [smiles2fp(smiles) for smiles in smiles_lst]
fp_mat = np.concatenate(fp_lst, 0)
fp = np.mean(fp_mat,0)
return fp
if __name__ == "__main__":
text = "interpret_result/NCT00329602__completed____1__1.7650960683822632__phase 4__['restless legs syndrome']__['placebo', 'ropinirole'].png"
print(replace_strange_symbol(text))
# if __name__ == "__main__":
# input_file_lst = get_path_of_all_xml_file()
# print(input_file_lst[:5])
# '''
# input_file_lst = [
# 'ClinicalTrialGov/NCT0000xxxx/NCT00000102.xml',
# 'ClinicalTrialGov/NCT0000xxxx/NCT00000104.xml',
# 'ClinicalTrialGov/NCT0000xxxx/NCT00000105.xml',
# ... ]
# '''
# if __name__ == "__main__":
# s1 = "328943"
# s2 = "13785"
# assert dynamic_programming(s1, s2)==2