Diff of /benchmark/utils.py [000000] .. [bc9e98]

Switch to unified view

a b/benchmark/utils.py
1
###### import ######
2
3
4
import numpy as np 
5
try:
6
    from rdkit import Chem 
7
    from rdkit.Chem import AllChem
8
except:
9
    pass 
10
###### import ######
11
12
13
14
def plot_hist(prefix_name, prediction, label):
    """Save overlaid histograms of predictions for positive and negative samples.

    Predictions whose label equals 1 are drawn in blue ("positive") and those
    whose label equals 0 in red ("negative"), each with 50 bins.  The image is
    written to ``prefix_name + "_histogram.png"``.

    Args:
        prefix_name: Path prefix of the output image.
        prediction: Iterable of predicted scores, aligned with ``label``.
        label: Iterable of labels; only entries equal to 1 or 0 are plotted.

    Returns:
        None.
    """
    # Imported lazily so the rest of the module works without matplotlib.
    import matplotlib.pyplot as plt

    figure_name = prefix_name + "_histogram.png"
    positive_prediction = [p for p, y in zip(prediction, label) if y == 1]
    negative_prediction = [p for p, y in zip(prediction, label) if y == 0]

    # Draw on a dedicated figure and close it afterwards; plotting on the
    # implicit current figure made repeated calls pile their bars and legend
    # entries on top of each other.
    fig, ax = plt.subplots()
    try:
        # Plain count histograms; alpha=0.4 mirrors the default used by the
        # deprecated seaborn ``distplot`` this replaces, so the two classes
        # stay visible where they overlap.
        ax.hist(positive_prediction, bins=50, color='blue', alpha=0.4, label='positive')
        ax.hist(negative_prediction, bins=50, color='red', alpha=0.4, label='negative')
        ax.legend()
        fig.savefig(figure_name)
    finally:
        plt.close(fig)
    return
25
26
def replace_strange_symbol(text):
    """Return ``text`` with file-name-unfriendly characters replaced by ``_``.

    The characters replaced are ``[``, ``]``, the single quote, the newline
    and the forward slash.

    Args:
        text: Input string.

    Returns:
        A new string in which every listed character became an underscore.
    """
    # One translate pass instead of one .replace() call per character.
    table = str.maketrans(dict.fromkeys("[]'\n/", '_'))
    return text.translate(table)
30
31
# Reference for the XML tree traversal below: https://blog.csdn.net/yiluochenwu/article/details/23515923
32
def walkData(root_node, prefix, result_list):
    """Collect ``[path, text]`` pairs for a node and all of its descendants.

    Nodes are visited in pre-order (a node first, then its children in
    document order).  The path of a node is its parent's path followed by
    ``'/'`` and the node's tag.

    Args:
        root_node: Element to start from; must expose ``tag``, ``text`` and
            iteration over its direct children (as ElementTree elements do).
        prefix: Path of the parent node (``""`` for the document root).
        result_list: List that the ``[path, text]`` pairs are appended to,
            in place.

    Returns:
        None; the output is accumulated in ``result_list``.
    """
    node_path = prefix + '/' + root_node.tag
    result_list.append([node_path, root_node.text])
    # Iterating an element yields its direct children.  The original
    # ``getchildren()`` call no longer exists on standard-library elements
    # since Python 3.9.
    for child in root_node:
        walkData(child, prefix=node_path, result_list=result_list)
40
41
42
def dynamic_programming(s1, s2):
    """Return the length of the longest common subsequence of ``s1`` and ``s2``.

    Args:
        s1: First sequence (for example a string).
        s2: Second sequence.

    Returns:
        The number of elements in the longest subsequence common to both
        inputs; 0 when either input is empty.
    """
    # An empty input has no common subsequence.  The previous implementation
    # indexed s1[0] / s2[0] unconditionally and raised IndexError here.
    if len(s1) == 0 or len(s2) == 0:
        return 0

    # previous_row[j] holds the LCS length between the part of s1 consumed so
    # far and s2[:j]; column 0 is the empty prefix of s2.
    previous_row = [0] * (len(s2) + 1)
    for item1 in s1:
        current_row = [0]
        for j, item2 in enumerate(s2, start=1):
            if item1 == item2:
                current_row.append(previous_row[j - 1] + 1)
            else:
                current_row.append(max(previous_row[j], current_row[j - 1]))
        previous_row = current_row
    return previous_row[-1]
63
64
65
def get_path_of_all_xml_file(input_file="./data/all_xml"):
    """Read a listing file and return its lines with surrounding whitespace removed.

    Args:
        input_file: Path of the text file to read, one entry per line.
            Defaults to ``"./data/all_xml"``, the location that used to be
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        A list with one stripped string per line of the file, in file order.
        Blank lines are kept as empty strings.
    """
    with open(input_file, 'r') as fin:
        return [line.strip() for line in fin]
71
72
73
def remove_multiple_space(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well, because the text is
    split on whitespace and the resulting words are re-joined.
    """
    words = text.split()
    return ' '.join(words)
76
77
def nctid_2_xml_file_path(nctid):
    """Build the path of the XML file for an 11-character trial identifier.

    The file is placed in a sub-folder named after the first seven characters
    of the identifier followed by ``"xxxx"``, e.g. ``"NCT00000102"`` maps to
    ``./ClinicalTrialGov/NCT0000xxxx/NCT00000102.xml``.

    Args:
        nctid: Trial identifier; must be exactly 11 characters long.

    Returns:
        The path string produced by ``os.path.join``.

    Raises:
        AssertionError: If ``nctid`` is not 11 characters long.
    """
    # ``os`` is not imported in the visible module header, so the original
    # body failed with NameError; import it locally to keep this function
    # self-contained.
    import os

    assert len(nctid) == 11, "nctid must be exactly 11 characters long"
    prefix = nctid[:7] + "xxxx"
    return os.path.join("./ClinicalTrialGov/", prefix, nctid + ".xml")
82
83
84
def fingerprints_from_mol(mol):
    """Fold a count-based Morgan fingerprint of ``mol`` into a fixed-size row.

    The fingerprint is requested from RDKit with radius argument 3,
    ``useCounts=True`` and ``useFeatures=True``.  Each non-zero element is
    added to the bucket given by its index modulo 2048.

    Args:
        mol: Molecule object accepted by ``AllChem.GetMorganFingerprint``.

    Returns:
        An ``int32`` NumPy array of shape ``(1, 2048)`` holding folded counts.
    """
    n_buckets = 2048
    sparse_counts = AllChem.GetMorganFingerprint(
        mol, 3, useCounts=True, useFeatures=True
    ).GetNonzeroElements()
    folded = np.zeros((1, n_buckets), np.int32)
    for feature_id, count in sparse_counts.items():
        # Distinct feature ids can land in the same bucket; their counts add up.
        folded[0, feature_id % n_buckets] += int(count)
    return folded
92
93
def smiles2fp(smiles):
    """Convert a SMILES string into a ``(1, 2048)`` ``int32`` fingerprint row.

    Args:
        smiles: SMILES string describing a molecule.

    Returns:
        The array produced by ``fingerprints_from_mol`` for the parsed
        molecule, or an all-zero ``(1, 2048)`` ``int32`` array when the
        string cannot be parsed or fingerprinting fails.
    """
    try:
        # The original passed the undefined name ``smile`` here; the resulting
        # NameError was swallowed by a bare ``except`` so every call silently
        # returned the all-zero fallback.
        mol = Chem.MolFromSmiles(smiles)
        # RDKit signals an unparsable SMILES by returning None.
        if mol is not None:
            return fingerprints_from_mol(mol)
    except Exception:
        # Deliberate best-effort: any failure yields the zero vector below.
        # Unlike a bare ``except`` this lets KeyboardInterrupt/SystemExit through.
        pass
    return np.zeros((1, 2048), np.int32)
100
101
def smiles_lst2fp(smiles_lst):
    """Average the fingerprints of several SMILES strings into one vector.

    Each string is converted with ``smiles2fp``, the resulting rows are
    concatenated along axis 0, and the mean over that axis is returned.

    Args:
        smiles_lst: Iterable of SMILES strings.

    Returns:
        The element-wise mean of the individual fingerprint rows.
    """
    rows = []
    for smiles in smiles_lst:
        rows.append(smiles2fp(smiles))
    stacked = np.concatenate(rows, axis=0)
    return np.mean(stacked, axis=0)
106
107
108
109
110
111
if __name__ == "__main__":
    # Manual smoke check: print the sanitised form of a sample result file name.
    sample_name = "interpret_result/NCT00329602__completed____1__1.7650960683822632__phase 4__['restless legs syndrome']__['placebo', 'ropinirole'].png"
    sanitized_name = replace_strange_symbol(sample_name)
    print(sanitized_name)
114
115
116
117
118
119
120
# if __name__ == "__main__":
121
#   input_file_lst = get_path_of_all_xml_file() 
122
#   print(input_file_lst[:5])
123
# '''
124
# input_file_lst = [ 
125
#   'ClinicalTrialGov/NCT0000xxxx/NCT00000102.xml', 
126
#   'ClinicalTrialGov/NCT0000xxxx/NCT00000104.xml', 
127
#   'ClinicalTrialGov/NCT0000xxxx/NCT00000105.xml', 
128
#     ... ]
129
# '''
130
131
132
133
# if __name__ == "__main__":
134
#   s1 = "328943"
135
#   s2 = "13785"
136
#   assert dynamic_programming(s1, s2)==2 
137
138
139
140
141