Diff of /source/datahelper.py [000000] .. [8af014]

Switch to unified view

a b/source/datahelper.py
1
import sys, re, math, time
2
import numpy as np
3
import matplotlib.pyplot as plt
4
import json
5
import pickle
6
import collections
7
from collections import OrderedDict
8
from matplotlib.pyplot import cm
9
#from keras.preprocessing.sequence import pad_sequences
10
11
12
## ######################## ##
13
#
14
#  Define CHARSET, CHARLEN
15
#
16
## ######################## ## 
17
18
# CHARPROTSET = { 'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, \
19
#             'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, \
20
#             'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19, 'X': 20, \
21
#             'O': 20, 'U': 20,
22
#             'B': (2, 11),
23
#             'Z': (3, 13),
24
#             'J': (7, 9) }
25
# CHARPROTLEN = 21
26
27
# Protein alphabet: maps each one-letter residue code to an integer label.
# Labels start at 1; the label encoders in this file zero-initialise their
# output, so 0 only ever appears at positions that were never filled.
CHARPROTSET = {
    "A": 1, "C": 2, "B": 3, "E": 4, "D": 5, "G": 6,
    "F": 7, "I": 8, "H": 9, "K": 10, "M": 11, "L": 12,
    "O": 13, "N": 14, "Q": 15, "P": 16, "S": 17, "R": 18,
    "U": 19, "T": 20, "W": 21,
    "V": 22, "Y": 23, "X": 24,
    "Z": 25,
}

# Number of distinct protein symbols (25); derived so it cannot drift from the table.
CHARPROTLEN = len(CHARPROTSET)
35
36
# Canonical-SMILES alphabet: maps each SMILES character to an integer label.
# Labels start at 1 and run contiguously up to the table size.
CHARCANSMISET = {
    "#": 1, "%": 2, ")": 3, "(": 4, "+": 5, "-": 6,
    ".": 7, "1": 8, "0": 9, "3": 10, "2": 11, "5": 12,
    "4": 13, "7": 14, "6": 15, "9": 16, "8": 17, "=": 18,
    "A": 19, "C": 20, "B": 21, "E": 22, "D": 23, "G": 24,
    "F": 25, "I": 26, "H": 27, "K": 28, "M": 29, "L": 30,
    "O": 31, "N": 32, "P": 33, "S": 34, "R": 35, "U": 36,
    "T": 37, "W": 38, "V": 39, "Y": 40, "[": 41, "Z": 42,
    "]": 43, "_": 44, "a": 45, "c": 46, "b": 47, "e": 48,
    "d": 49, "g": 50, "f": 51, "i": 52, "h": 53, "m": 54,
    "l": 55, "o": 56, "n": 57, "s": 58, "r": 59, "u": 60,
    "t": 61, "y": 62,
}

# Number of distinct canonical-SMILES symbols (62); derived from the table.
CHARCANSMILEN = len(CHARCANSMISET)
49
50
# Isomeric-SMILES alphabet: maps each SMILES character (including the
# stereo/bond markers "/", "\\" and "@") to an integer label.
# Labels start at 1 and run contiguously up to the table size.
CHARISOSMISET = {
    "#": 29, "%": 30, ")": 31, "(": 1, "+": 32, "-": 33, "/": 34, ".": 2,
    "1": 35, "0": 3, "3": 36, "2": 4, "5": 37, "4": 5, "7": 38, "6": 6,
    "9": 39, "8": 7, "=": 40, "A": 41, "@": 8, "C": 42, "B": 9, "E": 43,
    "D": 10, "G": 44, "F": 11, "I": 45, "H": 12, "K": 46, "M": 47, "L": 13,
    "O": 48, "N": 14, "P": 15, "S": 49, "R": 16, "U": 50, "T": 17, "W": 51,
    "V": 18, "Y": 52, "[": 53, "Z": 19, "]": 54, "\\": 20, "a": 55, "c": 56,
    "b": 21, "e": 57, "d": 22, "g": 58, "f": 23, "i": 59, "h": 24, "m": 60,
    "l": 25, "o": 61, "n": 26, "s": 62, "r": 27, "u": 63, "t": 28, "y": 64,
}

# Number of distinct isomeric-SMILES symbols (64); derived from the table.
CHARISOSMILEN = len(CHARISOSMISET)
60
61
62
## ######################## ##
63
#
64
#  Encoding Helpers
65
#
66
## ######################## ## 
67
68
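#  NOTE: stale draft of the affinity log-transform (it uses math.e); the live
#  version in DataSet.parse_data divides by math.pow(10,9) instead:
#  Y = -(np.log10(Y/(math.pow(math.e,9))))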
69
70
def one_hot_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """One-hot encode a SMILES string into a fixed-size matrix.

    Args:
        line: the SMILES string; only its first ``MAX_SMI_LEN`` characters
            are encoded, anything beyond that is ignored.
        MAX_SMI_LEN: number of rows of the result (one row per position).
        smi_ch_ind: mapping from character to its 1-based integer label.

    Returns:
        A float ``numpy`` array of shape ``(MAX_SMI_LEN, len(smi_ch_ind))``.
        Row ``i`` has a single 1 in column ``smi_ch_ind[line[i]] - 1``;
        rows past the end of ``line`` stay all-zero.

    Raises:
        KeyError: if ``line`` contains a character missing from ``smi_ch_ind``.
    """
    X = np.zeros((MAX_SMI_LEN, len(smi_ch_ind)))

    for pos, ch in enumerate(line[:MAX_SMI_LEN]):
        # Labels are 1-based, columns are 0-based.
        X[pos, smi_ch_ind[ch] - 1] = 1

    return X
77
78
def one_hot_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """One-hot encode a character sequence into a fixed-size matrix.

    Args:
        line: the sequence string; only its first ``MAX_SEQ_LEN`` characters
            are encoded, anything beyond that is ignored.
        MAX_SEQ_LEN: number of rows of the result (one row per position).
        smi_ch_ind: mapping from character to its 1-based integer label.

    Returns:
        A float ``numpy`` array of shape ``(MAX_SEQ_LEN, len(smi_ch_ind))``.
        Row ``i`` has a single 1 in column ``smi_ch_ind[line[i]] - 1``;
        rows past the end of ``line`` stay all-zero.

    Raises:
        KeyError: if ``line`` contains a character missing from ``smi_ch_ind``.
    """
    X = np.zeros((MAX_SEQ_LEN, len(smi_ch_ind)))

    for pos, ch in enumerate(line[:MAX_SEQ_LEN]):
        # Labels are 1-based, columns are 0-based.
        X[pos, smi_ch_ind[ch] - 1] = 1

    return X
84
85
86
def label_smiles(line, MAX_SMI_LEN, smi_ch_ind):
    """Encode a SMILES string as a fixed-length vector of integer labels.

    Args:
        line: the SMILES string; only its first ``MAX_SMI_LEN`` characters
            are encoded, anything beyond that is ignored.
        MAX_SMI_LEN: length of the returned vector.
        smi_ch_ind: mapping from character to its integer label.

    Returns:
        A float ``numpy`` array of length ``MAX_SMI_LEN`` whose entry ``i`` is
        ``smi_ch_ind[line[i]]``; positions past the end of ``line`` stay 0.

    Raises:
        KeyError: if ``line`` contains a character missing from ``smi_ch_ind``.
    """
    X = np.zeros(MAX_SMI_LEN)

    for pos, ch in enumerate(line[:MAX_SMI_LEN]):
        X[pos] = smi_ch_ind[ch]

    return X
92
93
def label_sequence(line, MAX_SEQ_LEN, smi_ch_ind):
    """Encode a character sequence as a fixed-length vector of integer labels.

    Args:
        line: the sequence string; only its first ``MAX_SEQ_LEN`` characters
            are encoded, anything beyond that is ignored.
        MAX_SEQ_LEN: length of the returned vector.
        smi_ch_ind: mapping from character to its integer label.

    Returns:
        A float ``numpy`` array of length ``MAX_SEQ_LEN`` whose entry ``i`` is
        ``smi_ch_ind[line[i]]``; positions past the end of ``line`` stay 0.

    Raises:
        KeyError: if ``line`` contains a character missing from ``smi_ch_ind``.
    """
    X = np.zeros(MAX_SEQ_LEN)

    for pos, ch in enumerate(line[:MAX_SEQ_LEN]):
        X[pos] = smi_ch_ind[ch]

    return X
100
101
102
103
## ######################## ##
104
#
105
#  DATASET Class
106
#
107
## ######################## ## 
108
# works for large dataset
109
class DataSet(object):
    """Reader for a drug-target dataset folder (e.g. ``/kiba/`` or ``/davis/``).

    Holds the maximum sequence/SMILES lengths and the character tables used
    to encode proteins and ligands, and offers helpers to load the fold
    definitions and the encoded data from disk.
    """

    def __init__(self, fpath, setting_no, seqlen, smilen, need_shuffle = False):
        """Store encoding parameters; no file is read here.

        Args:
            fpath: accepted for interface compatibility; not used by the
                constructor (paths are taken from ``FLAGS`` in the readers).
            setting_no: problem/setting number, stored as ``PROBLEMSET``.
            seqlen: maximum protein sequence length, stored as ``SEQLEN``.
            smilen: maximum SMILES length, stored as ``SMILEN``.
            need_shuffle: accepted for interface compatibility; not used.
        """
        self.SEQLEN = seqlen
        self.SMILEN = smilen
        self.charseqset = CHARPROTSET
        self.charseqset_size = CHARPROTLEN

        # The SMILES table can be swapped here (e.g. for CHARCANSMISET).
        self.charsmiset = CHARISOSMISET
        self.charsmiset_size = CHARISOSMILEN
        self.PROBLEMSET = setting_no

    def read_sets(self, FLAGS):
        """Load the test fold and the train folds for the configured setting.

        Args:
            FLAGS: object exposing ``dataset_path`` (dataset folder, expected
                to end with a path separator because file names are appended
                by plain string concatenation) and ``problem_type`` (setting
                number used in the fold file names).

        Returns:
            ``(test_fold, train_folds)`` as decoded from the two JSON files
            ``folds/test_fold_setting<N>.txt`` and
            ``folds/train_fold_setting<N>.txt``.
        """
        fpath = FLAGS.dataset_path
        setting_no = FLAGS.problem_type
        print("Reading %s start" % fpath)

        # Context managers close the files deterministically; the previous
        # bare open() calls leaked the handles.
        with open(fpath + "folds/test_fold_setting" + str(setting_no) + ".txt") as fh:
            test_fold = json.load(fh)
        with open(fpath + "folds/train_fold_setting" + str(setting_no) + ".txt") as fh:
            train_folds = json.load(fh)

        return test_fold, train_folds

    def parse_data(self, FLAGS, with_label=True):
        """Load ligands, proteins and the affinity matrix, and encode them.

        Args:
            FLAGS: object exposing ``dataset_path`` (dataset folder, ending
                with a path separator) and ``is_log`` (whether to apply the
                ``-log10(Y / 1e9)`` transform to the affinities).
            with_label: when true, encode with ``label_smiles`` /
                ``label_sequence`` (integer labels); otherwise encode with
                ``one_hot_smiles`` / ``one_hot_sequence``.

        Returns:
            ``(XD, XT, Y)``: list of encoded ligands (in file order), list of
            encoded proteins (in file order), and the affinity object loaded
            from the pickle file ``Y`` (optionally log-transformed).
        """
        fpath = FLAGS.dataset_path
        print("Read %s start" % fpath)

        # OrderedDict keeps the entries in file order.
        with open(fpath + "ligands_can.txt") as fh:
            ligands = json.load(fh, object_pairs_hook=OrderedDict)
        with open(fpath + "proteins.txt") as fh:
            proteins = json.load(fh, object_pairs_hook=OrderedDict)

        # SECURITY: pickle can execute arbitrary code while loading; only
        # point this at dataset files from a trusted source.
        # TODO: read from raw
        with open(fpath + "Y", "rb") as fh:
            Y = pickle.load(fh, encoding='latin1')
        if FLAGS.is_log:
            # NOTE(review): presumably converts nM affinities to a p-scale
            # value (-log10 of molar) -- confirm against the dataset docs.
            Y = -(np.log10(Y / (math.pow(10, 9))))

        if with_label:
            XD = [label_smiles(smi, self.SMILEN, self.charsmiset)
                  for smi in ligands.values()]
            XT = [label_sequence(seq, self.SEQLEN, self.charseqset)
                  for seq in proteins.values()]
        else:
            XD = [one_hot_smiles(smi, self.SMILEN, self.charsmiset)
                  for smi in ligands.values()]
            XT = [one_hot_sequence(seq, self.SEQLEN, self.charseqset)
                  for seq in proteins.values()]

        return XD, XT, Y
166
167
168
169