|
a |
|
b/source/datahelper.py |
|
|
1 |
import sys, re, math, time |
|
|
2 |
import numpy as np |
|
|
3 |
import matplotlib.pyplot as plt |
|
|
4 |
import json |
|
|
5 |
import pickle |
|
|
6 |
import collections |
|
|
7 |
from collections import OrderedDict |
|
|
8 |
from matplotlib.pyplot import cm |
|
|
9 |
#from keras.preprocessing.sequence import pad_sequences |
|
|
10 |
|
|
|
11 |
|
|
|
12 |
## ######################## ## |
|
|
13 |
# |
|
|
14 |
# Define CHARSET, CHARLEN |
|
|
15 |
# |
|
|
16 |
## ######################## ## |
|
|
17 |
|
|
|
18 |
# CHARPROTSET = { 'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, \ |
|
|
19 |
# 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, \ |
|
|
20 |
# 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, 'X': 20, \ |
|
|
21 |
# 'O': 20, 'U': 20, |
|
|
22 |
# 'B': (2, 11), |
|
|
23 |
# 'Z': (3, 13), |
|
|
24 |
# 'J': (7, 9) } |
|
|
25 |
# CHARPROTLEN = 21 |
|
|
26 |
|
|
|
27 |
# Protein vocabulary: residue letter -> integer label (labels start at 1,
# so 0 stays free for the zero-filled padding produced by the encoders).
CHARPROTSET = {
    "A": 1, "C": 2, "B": 3, "E": 4, "D": 5,
    "G": 6, "F": 7, "I": 8, "H": 9, "K": 10,
    "M": 11, "L": 12, "O": 13, "N": 14, "Q": 15,
    "P": 16, "S": 17, "R": 18, "U": 19, "T": 20,
    "W": 21, "V": 22, "Y": 23, "X": 24, "Z": 25,
}

# Number of distinct protein symbols (25).
CHARPROTLEN = len(CHARPROTSET)

# SMILES vocabulary, "CAN" variant: character -> integer label (1..62).
CHARCANSMISET = {
    "#": 1, "%": 2, ")": 3, "(": 4, "+": 5, "-": 6, ".": 7, "1": 8,
    "0": 9, "3": 10, "2": 11, "5": 12, "4": 13, "7": 14, "6": 15, "9": 16,
    "8": 17, "=": 18, "A": 19, "C": 20, "B": 21, "E": 22, "D": 23, "G": 24,
    "F": 25, "I": 26, "H": 27, "K": 28, "M": 29, "L": 30, "O": 31, "N": 32,
    "P": 33, "S": 34, "R": 35, "U": 36, "T": 37, "W": 38, "V": 39, "Y": 40,
    "[": 41, "Z": 42, "]": 43, "_": 44, "a": 45, "c": 46, "b": 47, "e": 48,
    "d": 49, "g": 50, "f": 51, "i": 52, "h": 53, "m": 54, "l": 55, "o": 56,
    "n": 57, "s": 58, "r": 59, "u": 60, "t": 61, "y": 62,
}

# Number of distinct symbols in the "CAN" SMILES vocabulary (62).
CHARCANSMILEN = len(CHARCANSMISET)

# SMILES vocabulary, "ISO" variant: character -> integer label (1..64).
# Adds "/", "@" and "\\" compared with the "CAN" table and has no "_".
CHARISOSMISET = {
    "#": 29, "%": 30, ")": 31, "(": 1,
    "+": 32, "-": 33, "/": 34, ".": 2,
    "1": 35, "0": 3, "3": 36, "2": 4,
    "5": 37, "4": 5, "7": 38, "6": 6,
    "9": 39, "8": 7, "=": 40, "A": 41,
    "@": 8, "C": 42, "B": 9, "E": 43,
    "D": 10, "G": 44, "F": 11, "I": 45,
    "H": 12, "K": 46, "M": 47, "L": 13,
    "O": 48, "N": 14, "P": 15, "S": 49,
    "R": 16, "U": 50, "T": 17, "W": 51,
    "V": 18, "Y": 52, "[": 53, "Z": 19,
    "]": 54, "\\": 20, "a": 55, "c": 56,
    "b": 21, "e": 57, "d": 22, "g": 58,
    "f": 23, "i": 59, "h": 24, "m": 60,
    "l": 25, "o": 61, "n": 26, "s": 62,
    "r": 27, "u": 63, "t": 28, "y": 64,
}

# Number of distinct symbols in the "ISO" SMILES vocabulary (64).
CHARISOSMILEN = len(CHARISOSMISET)
|
|
60 |
|
|
|
61 |
|
|
|
62 |
## ######################## ## |
|
|
63 |
# |
|
|
64 |
# Encoding Helpers |
|
|
65 |
# |
|
|
66 |
## ######################## ## |
|
|
67 |
|
|
|
68 |
# Y = -(np.log10(Y/(math.pow(math.e,9)))) |
|
|
69 |
|
|
|
70 |
def one_hot_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """One-hot encode a SMILES string into a fixed-size matrix.

    Args:
        line: SMILES string; characters past ``MAX_SMI_LEN`` are ignored.
        MAX_SMI_LEN: number of rows of the returned matrix.
        smi_ch_ind: mapping from character to its integer label; the label
            minus one is used as the column index.

    Returns:
        ``numpy`` array of shape ``(MAX_SMI_LEN, len(smi_ch_ind))`` with a 1
        per encoded character; rows beyond the string length stay all-zero.

    Raises:
        KeyError: if a character of ``line`` is missing from ``smi_ch_ind``.
    """
    encoded = np.zeros((MAX_SMI_LEN, len(smi_ch_ind)))
    clipped = line[:MAX_SMI_LEN]

    for row, symbol in enumerate(clipped):
        column = smi_ch_ind[symbol] - 1
        encoded[row, column] = 1

    return encoded
|
|
77 |
|
|
|
78 |
def one_hot_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """One-hot encode a sequence string into a fixed-size matrix.

    Args:
        line: sequence string; characters past ``MAX_SEQ_LEN`` are ignored.
        MAX_SEQ_LEN: number of rows of the returned matrix.
        smi_ch_ind: mapping from character to its integer label; the label
            minus one is used as the column index.

    Returns:
        ``numpy`` array of shape ``(MAX_SEQ_LEN, len(smi_ch_ind))`` with a 1
        per encoded character; rows beyond the string length stay all-zero.

    Raises:
        KeyError: if a character of ``line`` is missing from ``smi_ch_ind``.
    """
    n_symbols = len(smi_ch_ind)
    matrix = np.zeros((MAX_SEQ_LEN, n_symbols))

    for position, residue in enumerate(line[:MAX_SEQ_LEN]):
        label = smi_ch_ind[residue]
        matrix[position, label - 1] = 1

    return matrix
|
|
84 |
|
|
|
85 |
|
|
|
86 |
def label_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """Encode a SMILES string as a fixed-length vector of integer labels.

    Args:
        line: SMILES string; characters past ``MAX_SMI_LEN`` are ignored.
        MAX_SMI_LEN: length of the returned vector.
        smi_ch_ind: mapping from character to its integer label.

    Returns:
        1-D ``numpy`` array of length ``MAX_SMI_LEN`` holding the label of
        each character; positions beyond the string length stay 0.

    Raises:
        KeyError: if a character of ``line`` is missing from ``smi_ch_ind``.
    """
    labels = np.zeros(MAX_SMI_LEN)
    clipped = line[:MAX_SMI_LEN]

    for position, symbol in enumerate(clipped):
        labels[position] = smi_ch_ind[symbol]

    return labels
|
|
92 |
|
|
|
93 |
def label_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """Encode a sequence string as a fixed-length vector of integer labels.

    Args:
        line: sequence string; characters past ``MAX_SEQ_LEN`` are ignored.
        MAX_SEQ_LEN: length of the returned vector.
        smi_ch_ind: mapping from character to its integer label.

    Returns:
        1-D ``numpy`` array of length ``MAX_SEQ_LEN`` holding the label of
        each character; positions beyond the string length stay 0.

    Raises:
        KeyError: if a character of ``line`` is missing from ``smi_ch_ind``.
    """
    encoded = np.zeros(MAX_SEQ_LEN)

    for index, residue in enumerate(line[:MAX_SEQ_LEN]):
        label = smi_ch_ind[residue]
        encoded[index] = label

    return encoded
|
|
100 |
|
|
|
101 |
|
|
|
102 |
|
|
|
103 |
## ######################## ## |
|
|
104 |
# |
|
|
105 |
# DATASET Class |
|
|
106 |
# |
|
|
107 |
## ######################## ## |
|
|
108 |
# works for large dataset |
|
|
109 |
class DataSet(object): |
|
|
110 |
def __init__(self, fpath, setting_no, seqlen, smilen, need_shuffle = False): |
|
|
111 |
self.SEQLEN = seqlen |
|
|
112 |
self.SMILEN = smilen |
|
|
113 |
#self.NCLASSES = n_classes |
|
|
114 |
self.charseqset = CHARPROTSET |
|
|
115 |
self.charseqset_size = CHARPROTLEN |
|
|
116 |
|
|
|
117 |
self.charsmiset = CHARISOSMISET ###HERE CAN BE EDITED |
|
|
118 |
self.charsmiset_size = CHARISOSMILEN |
|
|
119 |
self.PROBLEMSET = setting_no |
|
|
120 |
|
|
|
121 |
# read raw file |
|
|
122 |
# self._raw = self.read_sets( FLAGS) |
|
|
123 |
|
|
|
124 |
# iteration flags |
|
|
125 |
# self._num_data = len(self._raw) |
|
|
126 |
|
|
|
127 |
|
|
|
128 |
def read_sets(self, FLAGS): ### fpath should be the dataset folder /kiba/ or /davis/ |
|
|
129 |
fpath = FLAGS.dataset_path |
|
|
130 |
setting_no = FLAGS.problem_type |
|
|
131 |
print("Reading %s start" % fpath) |
|
|
132 |
|
|
|
133 |
test_fold = json.load(open(fpath + "folds/test_fold_setting" + str(setting_no)+".txt")) |
|
|
134 |
train_folds = json.load(open(fpath + "folds/train_fold_setting" + str(setting_no)+".txt")) |
|
|
135 |
|
|
|
136 |
return test_fold, train_folds |
|
|
137 |
|
|
|
138 |
def parse_data(self, FLAGS, with_label=True): |
|
|
139 |
fpath = FLAGS.dataset_path |
|
|
140 |
print("Read %s start" % fpath) |
|
|
141 |
|
|
|
142 |
ligands = json.load(open(fpath+"ligands_can.txt"), object_pairs_hook=OrderedDict) |
|
|
143 |
proteins = json.load(open(fpath+"proteins.txt"), object_pairs_hook=OrderedDict) |
|
|
144 |
|
|
|
145 |
Y = pickle.load(open(fpath + "Y","rb"), encoding='latin1') ### TODO: read from raw |
|
|
146 |
if FLAGS.is_log: |
|
|
147 |
Y = -(np.log10(Y/(math.pow(10,9)))) |
|
|
148 |
|
|
|
149 |
XD = [] |
|
|
150 |
XT = [] |
|
|
151 |
|
|
|
152 |
if with_label: |
|
|
153 |
for d in ligands.keys(): |
|
|
154 |
XD.append(label_smiles(ligands[d], self.SMILEN, self.charsmiset)) |
|
|
155 |
|
|
|
156 |
for t in proteins.keys(): |
|
|
157 |
XT.append(label_sequence(proteins[t], self.SEQLEN, self.charseqset)) |
|
|
158 |
else: |
|
|
159 |
for d in ligands.keys(): |
|
|
160 |
XD.append(one_hot_smiles(ligands[d], self.SMILEN, self.charsmiset)) |
|
|
161 |
|
|
|
162 |
for t in proteins.keys(): |
|
|
163 |
XT.append(one_hot_sequence(proteins[t], self.SEQLEN, self.charseqset)) |
|
|
164 |
|
|
|
165 |
return XD, XT, Y |
|
|
166 |
|
|
|
167 |
|
|
|
168 |
|
|
|
169 |
|