# Extracted from a side-by-side diff view of:
# Cross validation/MOLI Complete/CisplatinTCGA_cvSoftTripletClassifierNetv15.1_Script.py
1
import torch 
2
import torch.nn as nn
3
import torch.nn.functional as F
4
import torch.optim as optim
5
import numpy as np
6
import matplotlib
7
matplotlib.use('Agg')
8
import matplotlib.pyplot as plt
9
import matplotlib.gridspec as gridspec
10
import pandas as pd
11
import math
12
import sklearn.preprocessing as sk
13
import seaborn as sns
14
from sklearn import metrics
15
from sklearn.feature_selection import VarianceThreshold
16
from sklearn.model_selection import train_test_split
17
from utils import AllTripletSelector,HardestNegativeTripletSelector, RandomNegativeTripletSelector, SemihardNegativeTripletSelector # Strategies for selecting triplets within a minibatch
18
from metrics import AverageNonzeroTripletsMetric
19
from torch.utils.data.sampler import WeightedRandomSampler
20
from sklearn.metrics import roc_auc_score
21
from sklearn.metrics import average_precision_score
22
import random
23
from random import randint
24
from sklearn.model_selection import StratifiedKFold
25
26
save_results_to = '/home/hnoghabi/SoftClassifierTripNetv15.1/Cisplatin/'
27
torch.manual_seed(42)
28
29
max_iter = 50
30
31
GDSCE = pd.read_csv("GDSC_exprs.Cisplatin.eb_with.TCGA_exprs.Cisplatin.tsv", 
32
                    sep = "\t", index_col=0, decimal = ",")
33
GDSCE = pd.DataFrame.transpose(GDSCE)
34
35
TCGAE = pd.read_csv("TCGA_exprs.Cisplatin.eb_with.GDSC_exprs.Cisplatin.tsv", 
36
                   sep = "\t", index_col=0, decimal = ",")
37
TCGAE = pd.DataFrame.transpose(TCGAE)
38
39
TCGAM = pd.read_csv("TCGA_mutations.Cisplatin.tsv", 
40
                   sep = "\t", index_col=0, decimal = ".")
41
TCGAM = pd.DataFrame.transpose(TCGAM)
42
TCGAM = TCGAM.loc[:,~TCGAM.columns.duplicated()]
43
44
TCGAC = pd.read_csv("TCGA_CNA.Cisplatin.tsv", 
45
                   sep = "\t", index_col=0, decimal = ".")
46
TCGAC = pd.DataFrame.transpose(TCGAC)
47
TCGAC = TCGAC.loc[:,~TCGAC.columns.duplicated()]
48
49
GDSCM = pd.read_csv("GDSC_mutations.Cisplatin.tsv", 
50
                    sep = "\t", index_col=0, decimal = ".")
51
GDSCM = pd.DataFrame.transpose(GDSCM)
52
GDSCM = GDSCM.loc[:,~GDSCM.columns.duplicated()]
53
54
GDSCC = pd.read_csv("GDSC_CNA.Cisplatin.tsv", 
55
                    sep = "\t", index_col=0, decimal = ".")
56
GDSCC.drop_duplicates(keep='last')
57
GDSCC = pd.DataFrame.transpose(GDSCC)
58
GDSCC = GDSCC.loc[:,~GDSCC.columns.duplicated()]
59
60
# Drop low-variance GDSC expression columns (VarianceThreshold, threshold 0.05).
# Only the fitted support is needed, so the transformed matrix is not computed.
selector = VarianceThreshold(threshold=0.05)
selector.fit(GDSCE)
kept_positions = selector.get_support(indices=True)
GDSCE = GDSCE[GDSCE.columns[kept_positions]]
63
64
def _to_binary(frame):
    """Return a copy of *frame* with NaN set to 0 and every other non-zero entry set to 1."""
    filled = frame.fillna(0)
    filled[filled != 0.0] = 1
    return filled


# Mutation and copy-number tables are reduced to 0/1 indicator matrices.
TCGAC = _to_binary(TCGAC)
TCGAM = _to_binary(TCGAM)
GDSCM = _to_binary(GDSCM)
GDSCC = _to_binary(GDSCC)
72
73
def _shared_labels(reference, *others):
    """Return the labels of *reference* that occur in every one of *others*.

    The result is a list without repeats, ordered by first occurrence in
    *reference*. A list is returned instead of a ``set`` for two reasons:
    pandas no longer accepts sets as ``.loc`` indexers, and iterating a set of
    strings is not stable across interpreter runs, which would change the row
    and column order (and therefore the CV folds) from run to run.

    Args:
        reference: Iterable of labels that fixes the output order.
        *others: Further iterables of labels to intersect with.

    Returns:
        list: The common labels in *reference* order.
    """
    common = set(reference).intersection(*map(set, others))
    return [label for label in dict.fromkeys(reference) if label in common]


# Features present in all six tables.
ls = _shared_labels(GDSCE.columns, GDSCM.columns, GDSCC.columns,
                    TCGAE.columns, TCGAM.columns, TCGAC.columns)
# Samples present in all three GDSC tables / all three TCGA tables.
ls2 = _shared_labels(GDSCE.index, GDSCM.index, GDSCC.index)
ls3 = _shared_labels(TCGAE.index, TCGAM.index, TCGAC.index)

# Restrict every table to the shared samples and features, in the same order.
TCGAE = TCGAE.loc[ls3, ls]
TCGAM = TCGAM.loc[ls3, ls]
TCGAC = TCGAC.loc[ls3, ls]
GDSCE = GDSCE.loc[ls2, ls]
GDSCM = GDSCM.loc[ls2, ls]
GDSCC = GDSCC.loc[ls2, ls]
90
91
# Drug-response annotations for both cohorts.
GDSCR = pd.read_csv("GDSC_response.Cisplatin.tsv",
                    sep="\t", index_col=0, decimal=",")
TCGAR = pd.read_csv("TCGA_response.Cisplatin.tsv",
                    sep="\t", index_col=0, decimal=",")

# Convert the GDSC row labels to strings before they are matched against the
# labels of the feature tables.
GDSCR.rename(mapper=str, axis='index', inplace=True)

# Order the responses by the feature tables' own row index. This guarantees
# that label i belongs to feature row i and avoids indexing ``.loc`` with a
# Python set (rejected by pandas 2.x, and unordered).
GDSCR = GDSCR.loc[GDSCE.index, :]
TCGAR = TCGAR.loc[TCGAE.index, :]

# Encode the response column: "R" -> 0, "S" -> 1. An unexpected value raises
# KeyError instead of silently becoming NaN.
d = {"R": 0, "S": 1}
GDSCR["response"] = GDSCR.loc[:, "response"].apply(d.__getitem__)
TCGAR["response"] = TCGAR.loc[:, "response"].apply(d.__getitem__)

# Labels used for the cross-validation on GDSC.
Y = GDSCR['response'].values
# The TCGA labels (TCGAR['response'].values) are encoded above but are not
# read again in the visible part of this script.
111
112
# Candidate values for the random hyper-parameter search.
ls_mb_size = [14, 36, 64]                      # minibatch size
ls_h_dim = [128, 64, 32, 16]                   # encoder output width
ls_marg = [0.5, 1, 1.5]                        # triplet-loss margin
ls_lr = [0.05, 0.01, 0.001, 0.005, 0.0005, 0.0001, 0.00005, 0.00001]  # learning rates
ls_epoch = [20, 50, 10, 15, 30, 40, 60, 70, 80, 90, 100]              # training epochs
ls_rate = [0.5, 0.6, 0.7, 0.8]                 # dropout probabilities
ls_wd = [0.01, 0.001, 0.1, 0.0001]             # classifier weight decay
ls_lam = [0.1, 0.2, 0.3, 0.4]                  # weight of the triplet term in the loss

# Five stratified, unshuffled folds. ``random_state`` only has an effect with
# ``shuffle=True``; passing it without shuffling did nothing on older
# scikit-learn releases and raises ValueError on current ones, so it is left
# out. The resulting splits are the same contiguous folds as before.
skf = StratifiedKFold(n_splits=5)
122
    
123
for iters in range(max_iter):
    # Fold counter; it is incremented at the top of every CV fold.
    k = 0

    # One random draw per hyper-parameter. The order of the draws is kept
    # exactly as before so the `random` stream is consumed identically.
    mbs = random.choice(ls_mb_size)
    hdm1 = hdm2 = hdm3 = random.choice(ls_h_dim)   # all three encoders share one width
    mrg = random.choice(ls_marg)
    lre, lrm, lrc, lrCL = (random.choice(ls_lr) for _ in range(4))
    epch = random.choice(ls_epoch)
    rate1, rate2, rate3, rate4 = (random.choice(ls_rate) for _ in range(4))
    wd = random.choice(ls_wd)
    lam = random.choice(ls_lam)
141
142
    for train_index, test_index in skf.split(GDSCE.values, Y):
        k += 1

        # Slice every omics table with the same row positions so that the
        # three views and the labels stay aligned sample by sample.
        X_trainE = GDSCE.values[train_index, :]
        X_testE = GDSCE.values[test_index, :]
        X_trainM = GDSCM.values[train_index, :]
        X_testM = GDSCM.values[test_index, :]
        X_trainC = GDSCC.values[train_index, :]
        # Bug fix: the held-out copy-number matrix used to be taken from the
        # mutation table (GDSCM), so the CNA encoder was evaluated on
        # mutation data.
        X_testC = GDSCC.values[test_index, :]
        y_trainE = Y[train_index]
        y_testE = Y[test_index]

        # Standardise expression with statistics from the training fold only.
        scalerGDSC = sk.StandardScaler()
        scalerGDSC.fit(X_trainE)
        X_trainE = scalerGDSC.transform(X_trainE)
        X_testE = scalerGDSC.transform(X_testE)

        # Replace any remaining NaN/inf in the indicator matrices.
        X_trainM = np.nan_to_num(X_trainM)
        X_trainC = np.nan_to_num(X_trainC)
        X_testM = np.nan_to_num(X_testM)
        X_testC = np.nan_to_num(X_testC)

        # Held-out fold as tensors; it is evaluated in one pass, not in batches.
        TX_testE = torch.FloatTensor(X_testE)
        TX_testM = torch.FloatTensor(X_testM)
        TX_testC = torch.FloatTensor(X_testC)
        ty_testE = torch.FloatTensor(y_testE.astype(int))
167
        
168
        # Train
        # Each training sample is weighted by the inverse of its class size,
        # and the sampler draws len(y_trainE) samples per epoch with
        # replacement according to those weights.
        _, class_sample_count = np.unique(y_trainE, return_counts=True)
        weight = 1. / class_sample_count
        # Labels double as positions into `weight` (class 0 -> weight[0], ...).
        samples_weight = torch.from_numpy(weight[y_trainE.astype(int)])
        sampler = WeightedRandomSampler(samples_weight.double(),
                                        len(samples_weight),
                                        replacement=True)

        mb_size = mbs

        train_tensors = (
            torch.FloatTensor(X_trainE),
            torch.FloatTensor(X_trainM),
            torch.FloatTensor(X_trainC),
            torch.FloatTensor(y_trainE.astype(int)),
        )
        trainDataset = torch.utils.data.TensorDataset(*train_tensors)

        # Ordering comes from the sampler, hence shuffle=False.
        trainLoader = torch.utils.data.DataLoader(dataset=trainDataset,
                                                  batch_size=mb_size,
                                                  shuffle=False,
                                                  num_workers=1,
                                                  sampler=sampler)
182
183
        # Number of training samples and input width of each omics view.
        n_sampE, IE_dim = X_trainE.shape
        n_sampM, IM_dim = X_trainM.shape
        n_sampC, IC_dim = X_trainC.shape

        # Per-fold aliases of the hyper-parameters sampled for this iteration.
        h_dim1, h_dim2, h_dim3 = hdm1, hdm2, hdm3
        Z_in = h_dim1 + h_dim2 + h_dim3   # width of the concatenated embedding
        marg, epoch = mrg, epch
        lrE, lrM, lrC = lre, lrm, lrc

        # Per-epoch histories: cost / AUC on the training and held-out folds.
        costtr, auctr = [], []
        costts, aucts = [], []

        # Triplet mining strategies handed to the selector modules.
        triplet_selector = RandomNegativeTripletSelector(marg)
        triplet_selector2 = AllTripletSelector()
204
205
        class AEE(nn.Module):
            """Single-layer encoder for the expression input.

            Stacks Linear(IE_dim -> h_dim1), BatchNorm1d, ReLU and
            Dropout(rate1). The sizes and the dropout rate are read from the
            enclosing script scope at construction time.
            """

            def __init__(self):
                super().__init__()
                layers = [
                    nn.Linear(IE_dim, h_dim1),
                    nn.BatchNorm1d(h_dim1),
                    nn.ReLU(),
                    nn.Dropout(rate1),
                ]
                self.EnE = nn.Sequential(*layers)

            def forward(self, x):
                """Return the encoded representation of ``x``."""
                return self.EnE(x)
216
217
        class AEM(nn.Module):
            """Single-layer encoder for the mutation input.

            Stacks Linear(IM_dim -> h_dim2), BatchNorm1d, ReLU and
            Dropout(rate2). The sizes and the dropout rate are read from the
            enclosing script scope at construction time.
            """

            def __init__(self):
                super().__init__()
                layers = [
                    nn.Linear(IM_dim, h_dim2),
                    nn.BatchNorm1d(h_dim2),
                    nn.ReLU(),
                    nn.Dropout(rate2),
                ]
                self.EnM = nn.Sequential(*layers)

            def forward(self, x):
                """Return the encoded representation of ``x``."""
                return self.EnM(x)
228
229
230
        class AEC(nn.Module):
            """Single-layer encoder for the copy-number input.

            Stacks Linear(IC_dim -> h_dim3), BatchNorm1d, ReLU and
            Dropout(rate3). The sizes and the dropout rate are read from the
            enclosing script scope at construction time.

            The first layer used to be sized with ``IM_dim`` (the mutation
            width). Both widths coincide as long as all tables are restricted
            to the same feature list, but the layer now uses the width of the
            matrix it actually receives.
            """

            def __init__(self):
                super(AEC, self).__init__()
                self.EnC = torch.nn.Sequential(
                    nn.Linear(IC_dim, h_dim3),
                    nn.BatchNorm1d(h_dim3),
                    nn.ReLU(),
                    nn.Dropout(rate3))

            def forward(self, x):
                """Return the encoded representation of ``x``."""
                return self.EnC(x)
241
242
        class OnlineTriplet(nn.Module):
            """Module wrapper that delegates triplet mining to a selector object."""

            def __init__(self, marg, triplet_selector):
                super().__init__()
                self.triplet_selector = triplet_selector
                self.marg = marg  # stored only; forward() does not read it

            def forward(self, embeddings, target):
                """Return the result of ``triplet_selector.get_triplets(embeddings, target)``."""
                return self.triplet_selector.get_triplets(embeddings, target)
250
251
        class OnlineTestTriplet(nn.Module):
            """Module wrapper that delegates triplet mining to a selector object."""

            def __init__(self, marg, triplet_selector):
                super().__init__()
                self.triplet_selector = triplet_selector
                self.marg = marg  # stored only; forward() does not read it

            def forward(self, embeddings, target):
                """Return the result of ``triplet_selector.get_triplets(embeddings, target)``."""
                return self.triplet_selector.get_triplets(embeddings, target)
259
260
        class Classifier(nn.Module):
            """Response classifier on top of the concatenated embedding.

            Stacks Linear(Z_in -> 1), Dropout(rate4) and Sigmoid; ``Z_in`` and
            ``rate4`` are read from the enclosing script scope at construction
            time.
            """

            def __init__(self):
                super().__init__()
                self.FC = nn.Sequential(
                    nn.Linear(Z_in, 1),
                    nn.Dropout(rate4),
                    nn.Sigmoid(),
                )

            def forward(self, x):
                """Return one value in [0, 1] per input row."""
                scores = self.FC(x)
                return scores
269
270
        torch.cuda.manual_seed_all(42)

        # Encoders are created in the order E, M, C (and the classifier last)
        # so parameter initialisation consumes the RNG in the same sequence.
        AutoencoderE, AutoencoderM, AutoencoderC = AEE(), AEM(), AEC()

        # One Adagrad optimiser per encoder, each with its own learning rate.
        solverE = optim.Adagrad(AutoencoderE.parameters(), lr=lrE)
        solverM = optim.Adagrad(AutoencoderM.parameters(), lr=lrM)
        solverC = optim.Adagrad(AutoencoderC.parameters(), lr=lrC)

        # Triplet loss plus the two triplet-mining wrappers.
        trip_criterion = torch.nn.TripletMarginLoss(margin=marg, p=2)
        TripSel = OnlineTriplet(marg, triplet_selector)
        TripSel2 = OnlineTestTriplet(marg, triplet_selector2)

        # Classifier head, its optimiser (the only one with weight decay)
        # and the binary cross-entropy loss.
        Clas = Classifier()
        SolverClass = optim.Adagrad(Clas.parameters(), lr=lrCL, weight_decay=wd)
        C_loss = torch.nn.BCELoss()
287
288
        for it in range(epoch):
            # Per-epoch accumulators. The cost is kept as a plain float so the
            # autograd graph of every minibatch can be freed after its step.
            epoch_cost4 = 0.0
            epoch_cost3 = []
            last_loss = None  # loss of the most recent minibatch that was trained on
            # At least 1, so a training fold smaller than one minibatch does
            # not cause a division by zero below.
            num_minibatches = max(1, n_sampE // mb_size)

            # Back to training mode (the evaluation below switches to eval()).
            AutoencoderE.train()
            AutoencoderM.train()
            AutoencoderC.train()
            Clas.train()

            for dataE, dataM, dataC, target in trainLoader:
                # Skip minibatches whose labels are all 0 or all 1:
                # roc_auc_score cannot be computed for a single class.
                label_mean = torch.mean(target)
                if label_mean == 0. or label_mean == 1.:
                    continue

                ZEX = AutoencoderE(dataE)
                ZMX = AutoencoderM(dataM)
                ZCX = AutoencoderC(dataC)

                ZT = torch.cat((ZEX, ZMX, ZCX), 1)
                # NOTE(review): dim=0 normalises each embedding feature across
                # the batch, not each sample's vector - confirm this is intended.
                ZT = F.normalize(ZT, p=2, dim=0)
                Pred = Clas(ZT)

                # Combined objective: lam * triplet loss + binary cross-entropy.
                Triplets = TripSel2(ZT, target)
                loss = lam * trip_criterion(ZT[Triplets[:, 0], :],
                                            ZT[Triplets[:, 1], :],
                                            ZT[Triplets[:, 2], :]) \
                    + C_loss(Pred, target.view(-1, 1))

                y_true = target.view(-1, 1)
                y_pred = Pred
                AUC = roc_auc_score(y_true.detach().numpy(), y_pred.detach().numpy())

                solverE.zero_grad()
                solverM.zero_grad()
                solverC.zero_grad()
                SolverClass.zero_grad()

                loss.backward()

                solverE.step()
                solverM.step()
                solverC.step()
                SolverClass.step()

                last_loss = loss.item()
                epoch_cost4 += last_loss / num_minibatches
                epoch_cost3.append(AUC)

            # Previously a flag that was reset on every minibatch decided
            # whether the epoch was logged, so an epoch was dropped whenever
            # its last minibatch happened to be single-class. Now the epoch is
            # logged if any minibatch was trained on.
            if last_loss is not None:
                costtr.append(epoch_cost4)
                auctr.append(np.mean(epoch_cost3))
                print('Iter-{}; Total loss: {:.4}'.format(it, last_loss))
            else:
                # Keep the training history aligned with the test history,
                # which gets one entry per epoch.
                costtr.append(float('nan'))
                auctr.append(float('nan'))

            # Evaluate on the held-out fold in a single pass.
            with torch.no_grad():

                AutoencoderE.eval()
                AutoencoderM.eval()
                AutoencoderC.eval()
                Clas.eval()

                ZET = AutoencoderE(TX_testE)
                ZMT = AutoencoderM(TX_testM)
                ZCT = AutoencoderC(TX_testC)

                ZTT = torch.cat((ZET, ZMT, ZCT), 1)
                ZTT = F.normalize(ZTT, p=2, dim=0)
                PredT = Clas(ZTT)

                TripletsT = TripSel2(ZTT, ty_testE)
                lossT = lam * trip_criterion(ZTT[TripletsT[:, 0], :],
                                             ZTT[TripletsT[:, 1], :],
                                             ZTT[TripletsT[:, 2], :]) \
                    + C_loss(PredT, ty_testE.view(-1, 1))

                y_truet = ty_testE.view(-1, 1)
                y_predt = PredT
                AUCt = roc_auc_score(y_truet.detach().numpy(), y_predt.detach().numpy())

                # Store a plain float rather than a tensor.
                costts.append(lossT.item())
                aucts.append(AUCt)
362
363
        # Hyper-parameter summary shared by both figure titles. The title is
        # also used as the file name of the saved figure.
        settings = (' iter = {}, fold = {}, h_dim[1,2,3] = ({},{},{}), marg = {}, '
                    'lr[E,M,C] = ({}, {}, {}), epoch = {}, '
                    'rate[1,2,3,4] = ({},{},{},{}), wd = {}, lrCL = {}, lam = {}').\
            format(iters, k, hdm1, hdm2, hdm3, mrg, lre, lrm, lrc, epch,
                   rate1, rate2, rate3, rate4, wd, lrCL, lam)

        # One figure for the cost ("C ...") and one for the AUC ("A ..."):
        # training curve in red, held-out curve in blue.
        for prefix, ylabel, train_curve, test_curve in (
                ('C', 'Total cost', costtr, costts),
                ('A', 'AUC', auctr, aucts)):
            # History entries may be 0-dim tensors (possibly still attached to
            # the autograd graph); float() turns them into plain numbers that
            # NumPy and matplotlib can always handle.
            train_values = [float(value) for value in train_curve]
            test_values = [float(value) for value in test_curve]

            plt.plot(train_values, '-r', test_values, '-b')
            plt.ylabel(ylabel)
            plt.xlabel('iterations (per tens)')

            title = prefix + settings
            plt.suptitle(title)
            plt.savefig(save_results_to + title + '.png', dpi=150)
            plt.close()