# python-scripts/runCancerDNN.py
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
import keras
from keras.layers import Input, Dense, concatenate, Dropout, average
from keras.models import Model
from keras import backend as K

# Train three neural networks
14
def build_NN_model1(omics):
15
    omics1=omics[0]
16
    omics2=omics[1]
17
    omics3=omics[2]
18
    input1_dim=omics1.shape[1]
19
    input2_dim = omics2.shape[1]
20
    input3_dim = omics3.shape[1]
21
    class_num = 4
22
23
24
    #omics1
25
    input_factor1 = Input(shape=(input1_dim,),name='omics1')
26
    # NN
27
    omics1_nn = Dense(500, activation='relu')(input_factor1)
28
    omics1_nn = Dropout(0.1)(omics1_nn)
29
    # omics1_nn = Dense(500, activation='relu')(omics1_nn)
30
    # omics1_nn = Dropout(0.1)(omics1_nn)
31
    omics1_nn = Dense(100, activation='relu')(omics1_nn)
32
    omics1_nn = Dropout(0.1)(omics1_nn)
33
34
35
    # omics2
36
    input_factor2 = Input(shape=(input2_dim,), name='omics2')
37
    # NN
38
    omics2_nn = Dense(500, activation='relu')(input_factor2)
39
    omics2_nn = Dropout(0.1)(omics2_nn)
40
    # omics2_nn = Dense(100, activation='relu')(omics2_nn)
41
    # omics2_nn = Dropout(0.1)(omics2_nn)
42
    omics2_nn = Dense(100, activation='relu')(omics2_nn)
43
    omics2_nn = Dropout(0.1)(omics2_nn)
44
45
    # omics3
46
    input_factor3 = Input(shape=(input3_dim,), name='omics3')
47
    # NN
48
    omics3_nn = Dense(500, activation='relu')(input_factor2)
49
    omics3_nn = Dropout(0.1)(omics3_nn)
50
    # omics3_nn = Dense(100, activation='relu')(omics3_nn)
51
    # omics3_nn = Dropout(0.1)(omics3_nn)
52
    omics3_nn = Dense(100, activation='relu')(omics3_nn)
53
    omics3_nn = Dropout(0.1)(omics3_nn)
54
55
    mid_concat=concatenate([omics1_nn, omics2_nn, omics3_nn])
56
    # classifier
57
    nn_classifier = Dense(100, activation='relu')(mid_concat)
58
    nn_classifier=Dropout(0.1)(nn_classifier)
59
    nn_classifier = Dense(50, activation='relu')(nn_classifier)
60
    nn_classifier = Dropout(0.1)(nn_classifier)
61
    # nn_classifier = Dense(50, activation='relu')(nn_classifier)
62
    # nn_classifier = Dropout(0.1)(nn_classifier)
63
    nn_classifier = Dense(10, activation='relu')(nn_classifier)
64
    #nn_classifier = Dropout(0.1)(nn_classifier)
65
    nn_classifier = Dense(class_num, activation='softmax', name='classifier')(nn_classifier)
66
    my_metrics = {
67
        'classifier': ['acc']
68
    }
69
    my_loss = {
70
        'classifier': 'categorical_crossentropy', \
71
        }
72
    adam=keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
73
    zlyNN = Model(inputs=[input_factor1,input_factor2,input_factor3], outputs=nn_classifier)
74
    zlyNN.compile(optimizer=adam, loss=my_loss, metrics=my_metrics)
75
    return zlyNN
def build_NN_model2(omics, class_num):
    """Build a single-input DNN classifier over one omics matrix.

    The network is a 500->100 ReLU stack with dropout, a 50-unit ReLU
    layer, and an output head that is a single sigmoid unit when
    ``class_num == 2`` (binary) or a ``class_num``-way softmax otherwise.

    Parameters
    ----------
    omics : 2-D array (samples x features); only ``shape[1]`` is read,
        to size the input layer.
    class_num : number of target classes; 2 selects the binary head and
        binary cross-entropy loss.

    Returns
    -------
    A compiled keras ``Model`` with input named 'omics' and output named
    'classifier'.
    """
    input_dim = omics.shape[1]
    is_binary = class_num == 2

    # Shared hidden stack.
    input_factor1 = Input(shape=(input_dim,), name='omics')
    hidden = Dense(500, activation='relu')(input_factor1)
    hidden = Dropout(0.1)(hidden)
    hidden = Dense(100, activation='relu')(hidden)
    hidden = Dropout(0.1)(hidden)
    head = Dense(50, activation='relu')(hidden)

    # Output head and loss depend on binary vs. multi-class.
    if is_binary:
        head = Dense(1, activation='sigmoid', name='classifier')(head)
        loss_name = 'binary_crossentropy'
    else:
        head = Dense(class_num, activation='softmax', name='classifier')(head)
        loss_name = 'categorical_crossentropy'

    zlyNN = Model(inputs=[input_factor1], outputs=head)
    zlyNN.compile(
        optimizer='adam',
        loss={'classifier': loss_name},
        metrics={'classifier': ['acc']},
    )
    return zlyNN
if __name__ == '__main__':
    # files = ['breast2']
    files = ['gbm']
    for f in files:
        datapath = './data/cancer_d2d/{f}'.format(f=f)

        # Load each omics matrix as strings, drop the first (sample-ID)
        # column, cast to float, and max-normalize each feature column.
        # NOTE: np.float / np.int were removed in NumPy 1.24; the
        # builtins float / int are the drop-in replacements.
        omics1 = np.loadtxt('{}/after_log_exp.txt'.format(datapath), str)
        omics1 = np.delete(omics1, 0, axis=1)
        omics1 = omics1.astype(float)
        omics1 = normalize(omics1, axis=0, norm='max')
        print(omics1.shape)

        omics2 = np.loadtxt('{}/after_log_mirna.txt'.format(datapath), str)
        omics2 = np.delete(omics2, 0, axis=1)
        omics2 = omics2.astype(float)
        omics2 = normalize(omics2, axis=0, norm='max')
        print(omics2.shape)

        omics3 = np.loadtxt('{}/after_methy.txt'.format(datapath), str)
        omics3 = np.delete(omics3, 0, axis=1)
        omics3 = omics3.astype(float)
        omics3 = normalize(omics3, axis=0, norm='max')
        print(omics3.shape)

        labels = np.loadtxt('{datapath}/after_labels.txt'.format(datapath=datapath), str)
        labels = np.delete(labels, 0, axis=1)
        labels = labels.astype(int)
        labels = np.squeeze(labels, axis=1)

        # Per-fold metric accumulators for k-fold cross-validation.
        all_acc = []
        all_f1_macro = []
        all_f1_weighted = []
        all_auc_macro = []
        all_auc_weighted = []

        # Loop-invariant: model sizing inputs and class count.
        omics_tobuild = [omics1, omics2, omics3]
        class_num = 4

        # Stratified 4-fold cross-validation; omics1 rows index samples,
        # so splitting on it keeps all three matrices sample-aligned.
        kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
        for train_ix, test_ix in kfold.split(omics1, labels):
            # Split every omics matrix with the same row indices so the
            # three network inputs stay aligned per sample.
            train_X = [omics1[train_ix], omics2[train_ix], omics3[train_ix]]
            test_X = [omics1[test_ix], omics2[test_ix], omics3[test_ix]]
            train_y, test_y = labels[train_ix], labels[test_ix]

            # Summarize train/test class composition.
            unique, count = np.unique(train_y, return_counts=True)
            train_data_count = dict(zip(unique, count))
            print('train:' + str(train_data_count))
            unique, count = np.unique(test_y, return_counts=True)
            test_data_count = dict(zip(unique, count))
            print('test:' + str(test_data_count))

            # One-hot encode labels for the categorical loss (replaces
            # the original hand-rolled per-sample loops).
            eye = np.eye(class_num, dtype='uint8')
            train_y = eye[np.int_(train_y)]
            test_y = eye[np.int_(test_y)]

            model = build_NN_model1(omics_tobuild)
            history = model.fit(train_X, train_y, epochs=50, verbose=2, batch_size=16, shuffle=True,
                                validation_data=(test_X, test_y))

            # Hard predictions for accuracy/F1; full probability matrix
            # for one-vs-rest AUC.
            y_true = np.argmax(test_y, axis=1)
            predictions = model.predict(test_X)
            y_pred = np.argmax(predictions, axis=1)

            acc = accuracy_score(y_true, y_pred)
            f1_macro = f1_score(y_true, y_pred, average='macro')
            f1_weighted = f1_score(y_true, y_pred, average='weighted')
            auc_macro = roc_auc_score(y_true, predictions, multi_class='ovr', average='macro')
            auc_weighted = roc_auc_score(y_true, predictions, multi_class='ovr', average='weighted')
            all_acc.append(acc)
            all_f1_macro.append(f1_macro)
            all_f1_weighted.append(f1_weighted)
            all_auc_macro.append(auc_macro)
            all_auc_weighted.append(auc_weighted)

            print(classification_report(y_true, y_pred))
            print(acc, f1_macro, f1_weighted, auc_macro, auc_weighted)

        # Report per-fold values and their means.
        print('caicai' * 20)
        print(
            'acc:{all_acc}\nf1_macro:{all_f1_macro}\nf1_weighted:{all_f1_weighted}\nauc_macro:{all_auc_macro}\nauc_weighted:{all_auc_weighted}'. \
            format(all_acc=all_acc, all_f1_macro=all_f1_macro, all_f1_weighted=all_f1_weighted,
                   all_auc_macro=all_auc_macro, all_auc_weighted=all_auc_weighted))
        avg_acc = np.mean(all_acc)
        avg_f1_macro = np.mean(all_f1_macro)
        avg_f1_weighted = np.mean(all_f1_weighted)
        avg_auc_macro = np.mean(all_auc_macro)
        avg_auc_weighted = np.mean(all_auc_weighted)
        print(
            'acc:{avg_acc}\nf1_macro:{avg_f1_macro}\nf1_weighted:{avg_f1_weighted}\nauc_macro:{avg_auc_macro}\nauc_weighted:{avg_auc_weighted}'. \
            format(avg_acc=avg_acc, avg_f1_macro=avg_f1_macro, avg_f1_weighted=avg_f1_weighted,
                   avg_auc_macro=avg_auc_macro, avg_auc_weighted=avg_auc_weighted))