# Cross validation / MOLI only classifier / DocetaxelTCGA_cvClassifierNetv8_ScriptCPU.py
#
# Loads GDSC/TCGA expression, mutation and CNA matrices for Docetaxel,
# harmonizes them to a common gene/sample set, binarizes mutation/CNA calls,
# encodes the drug response, and prepares the hyper-parameter search space
# and stratified CV splitter used by the training loop below.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib
matplotlib.use('Agg')  # headless backend: figures are only saved to disk
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import pandas as pd
import math
import sklearn.preprocessing as sk
import seaborn as sns
from sklearn import metrics
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import average_precision_score
import random
from sklearn.model_selection import StratifiedKFold

# Output directory for the per-fold cost/AUC plots.
save_results_to = '/home/hnoghabi/CVClassifierResultsv8/Docetaxel/'
max_iter = 50  # number of random hyper-parameter draws
torch.manual_seed(42)

# --- Load omics matrices ------------------------------------------------
# Expression files use ',' as the decimal separator; after the transpose,
# samples are rows and genes are columns.
GDSCE = pd.read_csv("GDSC_exprs.Docetaxel.eb_with.TCGA_exprs.Docetaxel.tsv",
                    sep = "\t", index_col=0, decimal = ",")
GDSCE = pd.DataFrame.transpose(GDSCE)

TCGAE = pd.read_csv("TCGA_exprs.Docetaxel.eb_with.GDSC_exprs.Docetaxel.tsv",
                    sep = "\t", index_col=0, decimal = ",")
TCGAE = pd.DataFrame.transpose(TCGAE)

TCGAM = pd.read_csv("TCGA_mutations.Docetaxel.tsv",
                    sep = "\t", index_col=0, decimal = ".")
TCGAM = pd.DataFrame.transpose(TCGAM)
TCGAM = TCGAM.loc[:,~TCGAM.columns.duplicated()]

TCGAC = pd.read_csv("TCGA_CNA.Docetaxel.tsv",
                    sep = "\t", index_col=0, decimal = ".")
TCGAC = pd.DataFrame.transpose(TCGAC)
TCGAC = TCGAC.loc[:,~TCGAC.columns.duplicated()]

GDSCM = pd.read_csv("GDSC_mutations.Docetaxel.tsv",
                    sep = "\t", index_col=0, decimal = ".")
GDSCM = pd.DataFrame.transpose(GDSCM)
GDSCM = GDSCM.loc[:,~GDSCM.columns.duplicated()]

GDSCC = pd.read_csv("GDSC_CNA.Docetaxel.tsv",
                    sep = "\t", index_col=0, decimal = ".")
# BUG FIX: drop_duplicates returns a new frame; the result was previously
# discarded, so duplicate rows were never actually removed.
GDSCC = GDSCC.drop_duplicates(keep='last')
GDSCC = pd.DataFrame.transpose(GDSCC)
GDSCC = GDSCC.loc[:,~GDSCC.columns.duplicated()]

# Keep only expression features with variance above 0.05 (fit on GDSC).
selector = VarianceThreshold(0.05)
selector.fit_transform(GDSCE)
GDSCE = GDSCE[GDSCE.columns[selector.get_support(indices=True)]]

# Binarize mutation and CNA calls: missing -> 0, any non-zero entry -> 1.
TCGAC = TCGAC.fillna(0)
TCGAC[TCGAC != 0.0] = 1
TCGAM = TCGAM.fillna(0)
TCGAM[TCGAM != 0.0] = 1
GDSCM = GDSCM.fillna(0)
GDSCM[GDSCM != 0.0] = 1
GDSCC = GDSCC.fillna(0)
GDSCC[GDSCC != 0.0] = 1

# Intersect genes (ls) across all six matrices, and samples within each
# cohort (ls2: GDSC cell lines, ls3: TCGA patients).
ls = set(GDSCE.columns.values).intersection(set(GDSCM.columns.values))
ls = set(ls).intersection(set(GDSCC.columns.values))
ls = set(ls).intersection(TCGAE.columns)
ls = set(ls).intersection(TCGAM.columns)
ls = set(ls).intersection(set(TCGAC.columns.values))
ls2 = set(GDSCE.index.values).intersection(set(GDSCM.index.values))
ls2 = set(ls2).intersection(set(GDSCC.index.values))
ls3 = set(TCGAE.index.values).intersection(set(TCGAM.index.values))
ls3 = set(ls3).intersection(set(TCGAC.index.values))
# FIX: pandas .loc rejects raw set indexers; materialize each set once so
# every matrix is sliced with the identical gene/sample ordering.
ls = list(ls)
ls2 = list(ls2)
ls3 = list(ls3)

TCGAE = TCGAE.loc[ls3,ls]
TCGAM = TCGAM.loc[ls3,ls]
TCGAC = TCGAC.loc[ls3,ls]
GDSCE = GDSCE.loc[ls2,ls]
GDSCM = GDSCM.loc[ls2,ls]
GDSCC = GDSCC.loc[ls2,ls]

# Drug response labels (same ',' decimal convention as the expression files).
GDSCR = pd.read_csv("GDSC_response.Docetaxel.tsv",
                    sep = "\t", index_col=0, decimal = ",")
TCGAR = pd.read_csv("TCGA_response.Docetaxel.tsv",
                    sep = "\t", index_col=0, decimal = ",")

GDSCR.rename(mapper = str, axis = 'index', inplace = True)
GDSCR = GDSCR.loc[ls2,:]
TCGAR = TCGAR.loc[ls3,:]

# Encode response: R(esistant) -> 0, S(ensitive) -> 1.
d = {"R":0,"S":1}
GDSCR["response"] = GDSCR.loc[:,"response"].apply(lambda x: d[x])
TCGAR["response"] = TCGAR.loc[:,"response"].apply(lambda x: d[x])

Y = GDSCR['response'].values

# Hyper-parameter search space for the random search in the loop below.
ls_mb_size = [30, 36, 60]
ls_h_dim = [1024, 512, 256, 128, 64, 32, 16]
ls_z_dim = [128, 64, 32, 16]
ls_marg = [0.5, 1, 1.5, 2, 2.5]
ls_lr = [0.5, 0.1, 0.05, 0.01, 0.001, 0.005, 0.0005, 0.0001,0.00005, 0.00001]
ls_epoch = [20, 50, 10, 15, 30, 40, 60, 70, 80, 90, 100]
ls_rate = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
ls_wd = [0.01, 0.001, 0.1, 0.0001]

# FIX: random_state is only honoured (and, in current scikit-learn, only
# legal) when shuffle=True; without it the seed was silently ignored.
skf = StratifiedKFold(n_splits=7, shuffle=True, random_state=42)
for iters in range(max_iter):
    # One random hyper-parameter configuration per outer iteration,
    # evaluated with 7-fold stratified CV (k counts the fold).
    k = 0
    mbs = random.choice(ls_mb_size)    # minibatch size
    hdm = random.choice(ls_h_dim)      # encoder hidden dim (per omics)
    zdm = random.choice(ls_z_dim)      # classifier hidden dim
    lre = random.choice(ls_lr)         # lr: expression encoder
    lrm = random.choice(ls_lr)         # lr: mutation encoder
    lrc = random.choice(ls_lr)         # lr: CNA encoder
    lrCL = random.choice(ls_lr)        # lr: classifier head
    epch = random.choice(ls_epoch)
    wd = random.choice(ls_wd)          # weight decay (classifier only)
    rate = random.choice(ls_rate)      # classifier dropout rate

    for train_index, test_index in skf.split(GDSCE.values, Y):
        k = k + 1
        X_trainE = GDSCE.values[train_index,:]
        X_testE = GDSCE.values[test_index,:]
        X_trainM = GDSCM.values[train_index,:]
        X_testM = GDSCM.values[test_index,:]
        X_trainC = GDSCC.values[train_index,:]
        # BUG FIX: the CNA test split was previously taken from GDSCM
        # (the mutation matrix) instead of GDSCC.
        X_testC = GDSCC.values[test_index,:]
        y_trainE = Y[train_index]
        y_testE = Y[test_index]

        # Standardize expression using statistics of the training fold only.
        scalerGDSC = sk.StandardScaler()
        scalerGDSC.fit(X_trainE)
        X_trainE = scalerGDSC.transform(X_trainE)
        X_testE = scalerGDSC.transform(X_testE)

        X_trainM = np.nan_to_num(X_trainM)
        X_trainC = np.nan_to_num(X_trainC)
        X_testM = np.nan_to_num(X_testM)
        X_testC = np.nan_to_num(X_testC)

        TX_testE = torch.FloatTensor(X_testE)
        TX_testM = torch.FloatTensor(X_testM)
        TX_testC = torch.FloatTensor(X_testC)
        ty_testE = torch.FloatTensor(y_testE.astype(int))

        # Train: rebalance classes by sampling inversely to class frequency.
        class_sample_count = np.array([len(np.where(y_trainE==t)[0]) for t in np.unique(y_trainE)])
        weight = 1. / class_sample_count
        samples_weight = np.array([weight[t] for t in y_trainE])

        samples_weight = torch.from_numpy(samples_weight)
        sampler = WeightedRandomSampler(samples_weight.type('torch.DoubleTensor'), len(samples_weight), replacement=True)

        mb_size = mbs

        trainDataset = torch.utils.data.TensorDataset(torch.FloatTensor(X_trainE), torch.FloatTensor(X_trainM),
                                                      torch.FloatTensor(X_trainC), torch.FloatTensor(y_trainE.astype(int)))

        trainLoader = torch.utils.data.DataLoader(dataset = trainDataset, batch_size=mb_size, shuffle=False, num_workers=1, sampler = sampler)

        n_sampE, IE_dim = X_trainE.shape
        n_sampM, IM_dim = X_trainM.shape
        n_sampC, IC_dim = X_trainC.shape

        h_dim = hdm
        Z_dim = zdm
        Z_in = h_dim + h_dim + h_dim  # concatenation of the three encoders
        lrE = lre
        lrM = lrm
        lrC = lrc
        epoch = epch

        costtr = []  # train loss per epoch
        auctr = []   # train AUC per epoch
        costts = []  # test loss per epoch
        aucts = []   # test AUC per epoch

        class AEE(nn.Module):
            """Expression encoder: linear -> batchnorm -> ReLU -> dropout."""
            def __init__(self):
                super(AEE, self).__init__()
                self.EnE = torch.nn.Sequential(
                    nn.Linear(IE_dim, h_dim),
                    nn.BatchNorm1d(h_dim),
                    nn.ReLU(),
                    nn.Dropout())
            def forward(self, x):
                return self.EnE(x)

        class AEM(nn.Module):
            """Mutation encoder: same topology as AEE over IM_dim inputs."""
            def __init__(self):
                super(AEM, self).__init__()
                self.EnM = torch.nn.Sequential(
                    nn.Linear(IM_dim, h_dim),
                    nn.BatchNorm1d(h_dim),
                    nn.ReLU(),
                    nn.Dropout())
            def forward(self, x):
                return self.EnM(x)

        class AEC(nn.Module):
            """CNA encoder."""
            def __init__(self):
                super(AEC, self).__init__()
                self.EnC = torch.nn.Sequential(
                    # BUG FIX: input width was IM_dim (mutation dim); the CNA
                    # encoder must consume IC_dim features.
                    nn.Linear(IC_dim, h_dim),
                    nn.BatchNorm1d(h_dim),
                    nn.ReLU(),
                    nn.Dropout())
            def forward(self, x):
                return self.EnC(x)

        class Classifier(nn.Module):
            """Binary head on the concatenated embeddings (sigmoid output)."""
            def __init__(self):
                super(Classifier, self).__init__()
                self.FC = torch.nn.Sequential(
                    nn.Linear(Z_in, Z_dim),
                    nn.ReLU(),
                    nn.Dropout(rate),
                    nn.Linear(Z_dim, 1),
                    nn.Dropout(rate),
                    nn.Sigmoid())
            def forward(self, x):
                return self.FC(x)

        torch.cuda.manual_seed_all(42)

        AutoencoderE = AEE()
        AutoencoderM = AEM()
        AutoencoderC = AEC()

        # Each encoder has its own Adagrad optimizer / learning rate.
        solverE = optim.Adagrad(AutoencoderE.parameters(), lr=lrE)
        solverM = optim.Adagrad(AutoencoderM.parameters(), lr=lrM)
        solverC = optim.Adagrad(AutoencoderC.parameters(), lr=lrC)

        Clas = Classifier()
        SolverClass = optim.SGD(Clas.parameters(), lr=lrCL, weight_decay = wd)
        C_loss = torch.nn.BCELoss()

        for it in range(epoch):

            epoch_cost4 = 0
            epoch_cost3 = []
            num_minibatches = int(n_sampE / mb_size)
            # BUG FIX: initialise before the batch loop. Previously `flag`
            # was reset inside the loop, so it only reflected the final
            # batch and was undefined when the loader yielded no batches.
            flag = 0

            for i, (dataE, dataM, dataC, target) in enumerate(trainLoader):
                AutoencoderE.train()
                AutoencoderM.train()
                AutoencoderC.train()
                Clas.train()

                # Skip single-class batches: BCE/AUC are degenerate there.
                if torch.mean(target)!=0. and torch.mean(target)!=1.:

                    ZEX = AutoencoderE(dataE)
                    ZMX = AutoencoderM(dataM)
                    ZCX = AutoencoderC(dataC)

                    ZT = torch.cat((ZEX, ZMX, ZCX), 1)
                    # NOTE(review): dim=0 normalizes across the batch, not
                    # per sample — kept as in the original formulation.
                    ZT = F.normalize(ZT, p=2, dim=0)

                    Pred = Clas(ZT)
                    loss = C_loss(Pred,target.view(-1,1))

                    y_true = target.view(-1,1)
                    y_pred = Pred
                    AUC = roc_auc_score(y_true.detach().numpy(),y_pred.detach().numpy())

                    solverE.zero_grad()
                    solverM.zero_grad()
                    solverC.zero_grad()
                    SolverClass.zero_grad()

                    loss.backward()

                    solverE.step()
                    solverM.step()
                    solverC.step()
                    SolverClass.step()

                    epoch_cost4 = epoch_cost4 + (loss / num_minibatches)
                    epoch_cost3.append(AUC)
                    flag = 1

            if flag == 1:
                # FIX: detach scalars from the autograd graph before logging;
                # storing graph-attached tensors leaked the graph across
                # epochs and broke plt.plot on requires_grad tensors.
                costtr.append(epoch_cost4.item())
                auctr.append(np.mean(epoch_cost3))
                print('Iter-{}; Total loss: {:.4}'.format(it, loss.item()))

            # Per-epoch evaluation on the held-out fold.
            with torch.no_grad():

                AutoencoderE.eval()
                AutoencoderM.eval()
                AutoencoderC.eval()
                Clas.eval()

                ZET = AutoencoderE(TX_testE)
                ZMT = AutoencoderM(TX_testM)
                ZCT = AutoencoderC(TX_testC)

                ZTT = torch.cat((ZET, ZMT, ZCT), 1)
                ZTT = F.normalize(ZTT, p=2, dim=0)

                PredT = Clas(ZTT)
                lossT = C_loss(PredT,ty_testE.view(-1,1))

                y_truet = ty_testE.view(-1,1)
                y_predt = PredT
                AUCt = roc_auc_score(y_truet.detach().numpy(),y_predt.detach().numpy())

                costts.append(lossT.item())
                aucts.append(AUCt)

        # Save loss curves (train red, test blue) for this config/fold.
        plt.plot(np.squeeze(costtr), '-r',np.squeeze(costts), '-b')
        plt.ylabel('Total cost')
        plt.xlabel('iterations (per tens)')

        title = 'Cost Docetaxel iter = {}, fold = {}, mb_size = {},  hz_dim[1,2] = ({},{}), lr[E,M,C] = ({}, {}, {}), epoch = {}, wd = {}, lrCL = {}, rate4 = {}'.\
                      format(iters, k, mbs, hdm, zdm , lre, lrm, lrc, epch, wd, lrCL, rate)

        plt.suptitle(title)
        plt.savefig(save_results_to + title + '.png', dpi = 150)
        plt.close()

        # Save AUC curves for the same run.
        plt.plot(np.squeeze(auctr), '-r',np.squeeze(aucts), '-b')
        plt.ylabel('AUC')
        plt.xlabel('iterations (per tens)')

        title = 'AUC Docetaxel iter = {}, fold = {}, mb_size = {},  hz_dim[1,2] = ({},{}), lr[E,M,C] = ({}, {}, {}), epoch = {}, wd = {}, lrCL = {}, rate4 = {}'.\
                      format(iters, k, mbs, hdm, zdm , lre, lrm, lrc, epch, wd, lrCL, rate)

        plt.suptitle(title)
        plt.savefig(save_results_to + title + '.png', dpi = 150)
        plt.close()