python-scripts/runCancerCNN.py
import os

import numpy as np
import keras
from keras.layers import Input, Dense, Dropout, Reshape, Conv1D, MaxPool1D, Flatten, concatenate
from keras.models import Model
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold

# os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
# os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Allocate GPU memory on demand instead of reserving it all up front.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
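
# Note: on TensorFlow 2.x the same on-demand memory behavior can be set with the
# native API instead of the compat.v1 session (a sketch, assuming a TF 2.x install):
#
#   import tensorflow as tf
#   for gpu in tf.config.list_physical_devices('GPU'):
#       tf.config.experimental.set_memory_growth(gpu, True)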


# Three-branch network: one 1-D CNN per omics matrix, fused before a shared classifier.
def build_NN_model1(omics, class_num):
    omics1 = omics[0]
    omics2 = omics[1]
    omics3 = omics[2]
    input1_dim = omics1.shape[1]
    input2_dim = omics2.shape[1]
    input3_dim = omics3.shape[1]

    # omics1 branch
    input_factor1 = Input(shape=(input1_dim,), name='omics1')
    input_re1 = Reshape((-1, 1))(input_factor1)
    omics1_cnn = Conv1D(32, 300, activation='relu')(input_re1)
    omics1_cnn = MaxPool1D(100)(omics1_cnn)
    flatten1 = Flatten()(omics1_cnn)

    # omics2 branch
    input_factor2 = Input(shape=(input2_dim,), name='omics2')
    input_re2 = Reshape((-1, 1))(input_factor2)
    omics2_cnn = Conv1D(32, 100, activation='relu', name='omics2_cnn_1')(input_re2)
    omics2_cnn = MaxPool1D(50)(omics2_cnn)
    flatten2 = Flatten(name='flatten2')(omics2_cnn)

    # omics3 branch
    input_factor3 = Input(shape=(input3_dim,), name='omics3')
    input_re3 = Reshape((-1, 1))(input_factor3)
    omics3_cnn = Conv1D(32, 300, activation='relu')(input_re3)
    omics3_cnn = MaxPool1D(100)(omics3_cnn)
    flatten3 = Flatten()(omics3_cnn)

    # Fuse the three branches, then classify.
    mid_concat = concatenate([flatten1, flatten2, flatten3])
    nn_classifier = Dense(100, activation='relu')(mid_concat)
    nn_classifier = Dropout(0.1)(nn_classifier)
    nn_classifier = Dense(50, activation='relu')(nn_classifier)
    nn_classifier = Dropout(0.1)(nn_classifier)
    nn_classifier = Dense(10, activation='relu')(nn_classifier)
    nn_classifier = Dense(class_num, activation='softmax', name='classifier')(nn_classifier)

    my_metrics = {'classifier': ['acc']}
    my_loss = {'classifier': 'categorical_crossentropy'}
    adam = keras.optimizers.Adam(learning_rate=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    zlyNN = Model(inputs=[input_factor1, input_factor2, input_factor3], outputs=nn_classifier)
    zlyNN.compile(optimizer=adam, loss=my_loss, metrics=my_metrics)
    return zlyNN
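
# Usage sketch for build_NN_model1 (hypothetical shapes, just to show that the inputs
# are passed as a list in the same order as the three branches; the kernel sizes above
# require at least 300, 100, and 300 features per matrix respectively):
#
#   ex1 = np.random.rand(8, 2000)
#   ex2 = np.random.rand(8, 500)
#   ex3 = np.random.rand(8, 2000)
#   m1 = build_NN_model1([ex1, ex2, ex3], class_num=4)
#   m1.summary()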


# Single-input network: a two-stage 1-D CNN over the concatenated omics matrix.
def build_NN_model2(omics, class_num):
    input_dim = omics.shape[1]

    input_factor1 = Input(shape=(input_dim,), name='omics')
    input_re = Reshape((-1, 1))(input_factor1)
    omics1_cnn = Conv1D(32, 1000, activation='relu')(input_re)
    omics1_cnn = MaxPool1D(100)(omics1_cnn)
    omics1_cnn = Conv1D(16, 50, activation='relu')(omics1_cnn)
    omics1_cnn = MaxPool1D(10)(omics1_cnn)
    flatten = Flatten()(omics1_cnn)

    nn_classifier = Dense(50, activation='relu')(flatten)
    # Binary tasks get a single sigmoid unit; multi-class tasks get a softmax head.
    if class_num == 2:
        nn_classifier = Dense(1, activation='sigmoid', name='classifier')(nn_classifier)
    else:
        nn_classifier = Dense(class_num, activation='softmax', name='classifier')(nn_classifier)

    my_metrics = {'classifier': ['acc']}
    my_loss_multi = {'classifier': 'categorical_crossentropy'}
    my_loss_bi = {'classifier': 'binary_crossentropy'}

    zlyNN = Model(inputs=[input_factor1], outputs=nn_classifier)
    if class_num == 2:
        zlyNN.compile(optimizer='adam', loss=my_loss_bi, metrics=my_metrics)
    else:
        zlyNN.compile(optimizer='adam', loss=my_loss_multi, metrics=my_metrics)
    return zlyNN
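
# Minimal smoke test for build_NN_model2 (hypothetical shape, not the real data). The
# stacked Conv1D/MaxPool1D stages (kernels 1000 and 50) need enough input features to
# keep every intermediate sequence length positive, e.g.:
#
#   dummy = np.random.rand(8, 20000)
#   m2 = build_NN_model2(dummy, class_num=4)
#   m2.summary()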


if __name__ == '__main__':
    # One dataset at a time; earlier candidates are kept as comments.
    # files = ['breast2']
    # files = ['gbm']
    # files = ['sarcoma2']
    # files = ['LUAD2']
    files = ['STAD2']
    for f in files:
        datapath = './data/cancer_d2d/{f}'.format(f=f)
        # Each matrix is read as strings, the leading ID column dropped,
        # then cast to float and max-normalized per feature.
        omics1 = np.loadtxt('{}/after_log_exp.txt'.format(datapath), str)
        omics1 = np.delete(omics1, 0, axis=1)
        # omics1 = np.transpose(omics1)
        omics1 = omics1.astype(float)
        omics1 = normalize(omics1, axis=0, norm='max')
        print(omics1.shape)
        omics2 = np.loadtxt('{}/after_log_mirna.txt'.format(datapath), str)
        omics2 = np.delete(omics2, 0, axis=1)
        # omics2 = np.transpose(omics2)
        omics2 = omics2.astype(float)
        omics2 = normalize(omics2, axis=0, norm='max')
        print(omics2.shape)
        omics3 = np.loadtxt('{}/after_methy.txt'.format(datapath), str)
        omics3 = np.delete(omics3, 0, axis=1)
        # omics3 = np.transpose(omics3)
        omics3 = omics3.astype(float)
        omics3 = normalize(omics3, axis=0, norm='max')
        print(omics3.shape)
        labels = np.loadtxt('{datapath}/after_labels.txt'.format(datapath=datapath), str)
        labels = np.delete(labels, 0, axis=1)
        labels = labels.astype(int)
        labels = np.squeeze(labels, axis=1)
        # k-fold cross-validation
        all_acc = []
        all_f1_macro = []
        all_f1_weighted = []
        all_auc_macro = []
        all_auc_weighted = []
        # omics = np.loadtxt('./result/nmf/mf_em.txt')
        omics = np.concatenate((omics1, omics2, omics3), axis=1)
        kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
        for train_ix, test_ix in kfold.split(omics, labels):
            # select rows
            train_X, test_X = omics[train_ix], omics[test_ix]
            train_y, test_y = labels[train_ix], labels[test_ix]
            # summarize train and test composition
            unique, count = np.unique(train_y, return_counts=True)
            train_data_count = dict(zip(unique, count))
            print('train:' + str(train_data_count))
            unique, count = np.unique(test_y, return_counts=True)
            test_data_count = dict(zip(unique, count))
            print('test:' + str(test_data_count))
            class_num = 4  # hard-coded; must match the number of distinct labels
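            # If the label set varies across datasets, the class count could instead be
            # derived from the data (an alternative, not what this script does):
            #   class_num = len(np.unique(labels))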

            # One-hot encode the integer labels for the softmax output.
            train_y = list(np.int_(train_y))
            y = []
            num = len(train_y)
            for i in range(num):
                tmp = np.zeros(class_num, dtype='uint8')
                tmp[train_y[i]] = 1
                y.append(tmp)
            train_y = np.array(y)

            test_y = list(np.int_(test_y))
            y = []
            num = len(test_y)
            for i in range(num):
                tmp = np.zeros(class_num, dtype='uint8')
                tmp[test_y[i]] = 1
                y.append(tmp)
            test_y = np.array(y)
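
            # The two loops above are equivalent (dtype aside) to Keras' built-in helper,
            # applied to the integer labels before conversion:
            #   train_y = keras.utils.to_categorical(train_y, class_num)
            #   test_y = keras.utils.to_categorical(test_y, class_num)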

            model = build_NN_model2(omics, class_num)
            model.summary()
            history = model.fit(train_X, train_y, epochs=50, verbose=2, batch_size=8, shuffle=True,
                                validation_data=(test_X, test_y))
            # Recover integer labels from the one-hot targets and softmax probabilities.
            y_true = np.argmax(test_y, axis=1)
            predictions = model.predict(test_X)
            y_pred = np.argmax(predictions, axis=1)
            acc = accuracy_score(y_true, y_pred)
            f1_macro = f1_score(y_true, y_pred, average='macro')
            # f1_micro = f1_score(y_true, y_pred, average='micro')
            f1_weighted = f1_score(y_true, y_pred, average='weighted')
            # Multi-class AUC needs the full probability matrix, not the argmax labels.
            auc_macro = roc_auc_score(y_true, predictions, multi_class='ovr', average='macro')
            auc_weighted = roc_auc_score(y_true, predictions, multi_class='ovr', average='weighted')
            all_acc.append(acc)
            all_f1_macro.append(f1_macro)
            all_f1_weighted.append(f1_weighted)
            all_auc_macro.append(auc_macro)
            all_auc_weighted.append(auc_weighted)

            print(classification_report(y_true, y_pred))
            print(acc, f1_macro, f1_weighted, auc_macro, auc_weighted)
        print('caicai' * 20)
        print('acc:{all_acc}\nf1_macro:{all_f1_macro}\nf1_weighted:{all_f1_weighted}\n'
              'auc_macro:{all_auc_macro}\nauc_weighted:{all_auc_weighted}'
              .format(all_acc=all_acc, all_f1_macro=all_f1_macro, all_f1_weighted=all_f1_weighted,
                      all_auc_macro=all_auc_macro, all_auc_weighted=all_auc_weighted))
        avg_acc = np.mean(all_acc)
        avg_f1_macro = np.mean(all_f1_macro)
        avg_f1_weighted = np.mean(all_f1_weighted)
        avg_auc_macro = np.mean(all_auc_macro)
        avg_auc_weighted = np.mean(all_auc_weighted)
        print('acc:{avg_acc}\nf1_macro:{avg_f1_macro}\nf1_weighted:{avg_f1_weighted}\n'
              'auc_macro:{avg_auc_macro}\nauc_weighted:{avg_auc_weighted}'
              .format(avg_acc=avg_acc, avg_f1_macro=avg_f1_macro, avg_f1_weighted=avg_f1_weighted,
                      avg_auc_macro=avg_auc_macro, avg_auc_weighted=avg_auc_weighted))


    # Earlier variant: train the three-branch model (build_NN_model1) on the three
    # omics matrices as separate inputs instead of one concatenated matrix.
    # files = ['breast2']
    # # files = ['gbm']
    # # files = ['sarcoma2']
    # files = ['LUAD2']
    # files = ['STAD2']
    # for f in files:
    #     datapath = './data/cancer_d2d/{f}'.format(f=f)
    #     omics1 = np.loadtxt('{}/after_log_exp.txt'.format(datapath), str)
    #     omics1 = np.delete(omics1, 0, axis=1)
    #     omics1 = omics1.astype(float)
    #     omics1 = normalize(omics1, axis=0, norm='max')
    #     print(omics1.shape)
    #     omics2 = np.loadtxt('{}/after_log_mirna.txt'.format(datapath), str)
    #     omics2 = np.delete(omics2, 0, axis=1)
    #     omics2 = omics2.astype(float)
    #     omics2 = normalize(omics2, axis=0, norm='max')
    #     print(omics2.shape)
    #     omics3 = np.loadtxt('{}/after_methy.txt'.format(datapath), str)
    #     omics3 = np.delete(omics3, 0, axis=1)
    #     omics3 = omics3.astype(float)
    #     omics3 = normalize(omics3, axis=0, norm='max')
    #     print(omics3.shape)
    #     labels = np.loadtxt('{datapath}/after_labels.txt'.format(datapath=datapath), str)
    #     labels = np.delete(labels, 0, axis=1)
    #     labels = labels.astype(int)
    #     labels = np.squeeze(labels, axis=1)
    #
    #     # k-fold cross-validation
    #     all_acc = []
    #     all_f1_macro = []
    #     all_f1_weighted = []
    #     all_auc_macro = []
    #     all_auc_weighted = []
    #
    #     omics = np.concatenate((omics1, omics2, omics3), axis=1)
    #     unique, count = np.unique(labels, return_counts=True)
    #     all_count = dict(zip(unique, count))
    #     print(str(all_count))
    #     kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
    #     for train_ix, test_ix in kfold.split(omics1, labels):
    #         omics_tobuild = [omics1, omics2, omics3]
    #         train_X_1 = omics1[train_ix]
    #         train_X_2 = omics2[train_ix]
    #         train_X_3 = omics3[train_ix]
    #
    #         test_X_1 = omics1[test_ix]
    #         test_X_2 = omics2[test_ix]
    #         test_X_3 = omics3[test_ix]
    #         # select rows
    #         train_X, test_X = [train_X_1, train_X_2, train_X_3], [test_X_1, test_X_2, test_X_3]
    #         train_y, test_y = labels[train_ix], labels[test_ix]
    #         # summarize train and test composition
    #         unique, count = np.unique(train_y, return_counts=True)
    #         train_data_count = dict(zip(unique, count))
    #         print('train:' + str(train_data_count))
    #         unique, count = np.unique(test_y, return_counts=True)
    #         test_data_count = dict(zip(unique, count))
    #         print('test:' + str(test_data_count))
    #         class_num = 4
    #         # One-hot encode the integer labels for the softmax output.
    #         train_y = list(np.int_(train_y))
    #         y = []
    #         num = len(train_y)
    #         for i in range(num):
    #             tmp = np.zeros(class_num, dtype='uint8')
    #             tmp[train_y[i]] = 1
    #             y.append(tmp)
    #         train_y = np.array(y)
    #
    #         test_y = list(np.int_(test_y))
    #         y = []
    #         num = len(test_y)
    #         for i in range(num):
    #             tmp = np.zeros(class_num, dtype='uint8')
    #             tmp[test_y[i]] = 1
    #             y.append(tmp)
    #         test_y = np.array(y)
    #
    #         model = build_NN_model1(omics_tobuild, class_num)
    #         model.summary()
    #         history = model.fit(train_X, train_y, epochs=50, verbose=2, batch_size=16, shuffle=True,
    #                             validation_data=(test_X, test_y))
    #         y_true = np.argmax(test_y, axis=1)
    #         predictions = model.predict(test_X)
    #         y_pred = np.argmax(predictions, axis=1)
    #         acc = accuracy_score(y_true, y_pred)
    #         f1_macro = f1_score(y_true, y_pred, average='macro')
    #         f1_weighted = f1_score(y_true, y_pred, average='weighted')
    #         auc_macro = roc_auc_score(y_true, predictions, multi_class='ovr', average='macro')
    #         auc_weighted = roc_auc_score(y_true, predictions, multi_class='ovr', average='weighted')
    #         all_acc.append(acc)
    #         all_f1_macro.append(f1_macro)
    #         all_f1_weighted.append(f1_weighted)
    #         all_auc_macro.append(auc_macro)
    #         all_auc_weighted.append(auc_weighted)
    #
    #         print(classification_report(y_true, y_pred))
    #         print(acc, f1_macro, f1_weighted, auc_macro, auc_weighted)
    #     print('caicai' * 20)
    #     print('acc:{all_acc}\nf1_macro:{all_f1_macro}\nf1_weighted:{all_f1_weighted}\n'
    #           'auc_macro:{all_auc_macro}\nauc_weighted:{all_auc_weighted}'
    #           .format(all_acc=all_acc, all_f1_macro=all_f1_macro, all_f1_weighted=all_f1_weighted,
    #                   all_auc_macro=all_auc_macro, all_auc_weighted=all_auc_weighted))
    #     avg_acc = np.mean(all_acc)
    #     avg_f1_macro = np.mean(all_f1_macro)
    #     avg_f1_weighted = np.mean(all_f1_weighted)
    #     avg_auc_macro = np.mean(all_auc_macro)
    #     avg_auc_weighted = np.mean(all_auc_weighted)
    #     print('acc:{avg_acc}\nf1_macro:{avg_f1_macro}\nf1_weighted:{avg_f1_weighted}\n'
    #           'auc_macro:{avg_auc_macro}\nauc_weighted:{avg_auc_weighted}'
    #           .format(avg_acc=avg_acc, avg_f1_macro=avg_f1_macro, avg_f1_weighted=avg_f1_weighted,
    #                   avg_auc_macro=avg_auc_macro, avg_auc_weighted=avg_auc_weighted))