python-scripts/runSimulationsCNN.py
import numpy as np
import keras
from keras.layers import Input, Dense, Reshape, Conv1D, MaxPool1D, Flatten, Dropout, concatenate
from keras.models import Model
from sklearn.preprocessing import normalize
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, classification_report
from sklearn.model_selection import StratifiedKFold
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# let TensorFlow grow GPU memory on demand instead of reserving it all up front
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
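
# A TF2-native equivalent of the compat-mode setup above, as a sketch (assumes
# `import tensorflow as tf`); kept commented out so the script retains its
# original tensorflow.compat.v1 session:
#   for gpu in tf.config.list_physical_devices('GPU'):
#       tf.config.experimental.set_memory_growth(gpu, True)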


# Build the late-fusion network: one CNN branch per omics type, merged into a
# shared softmax classifier
def build_NN_model1(omics, class_num):
    omics1 = omics[0]
    omics2 = omics[1]
    omics3 = omics[2]
    input1_dim = omics1.shape[1]
    input2_dim = omics2.shape[1]
    input3_dim = omics3.shape[1]

    # omics1 branch: treat the feature vector as a 1-D sequence
    input_factor1 = Input(shape=(input1_dim,), name='omics1')
    input_re1 = Reshape((-1, 1))(input_factor1)
    omics1_cnn = Conv1D(8, 10, activation='relu')(input_re1)
    omics1_cnn = MaxPool1D(2)(omics1_cnn)

    flatten1 = Flatten()(omics1_cnn)

    # omics2 branch
    input_factor2 = Input(shape=(input2_dim,), name='omics2')
    input_re2 = Reshape((-1, 1))(input_factor2)
    omics2_cnn = Conv1D(8, 10, activation='relu', name='omics2_cnn_1')(input_re2)
    omics2_cnn = MaxPool1D(2)(omics2_cnn)

    flatten2 = Flatten(name='flatten2')(omics2_cnn)

    # omics3 branch
    input_factor3 = Input(shape=(input3_dim,), name='omics3')
    input_re3 = Reshape((-1, 1))(input_factor3)
    omics3_cnn = Conv1D(8, 10, activation='relu')(input_re3)
    omics3_cnn = MaxPool1D(2)(omics3_cnn)

    flatten3 = Flatten()(omics3_cnn)

    # late fusion: concatenate the flattened branch outputs
    mid_concat = concatenate([flatten1, flatten2, flatten3])
    # classifier head
    nn_classifier = Dense(100, activation='relu')(mid_concat)
    nn_classifier = Dropout(0.1)(nn_classifier)
    nn_classifier = Dense(50, activation='relu')(nn_classifier)
    nn_classifier = Dropout(0.1)(nn_classifier)
    nn_classifier = Dense(10, activation='relu')(nn_classifier)
    nn_classifier = Dense(class_num, activation='softmax', name='classifier')(nn_classifier)

    my_metrics = {'classifier': ['acc']}
    my_loss = {'classifier': 'categorical_crossentropy'}
    adam = keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    zlyNN = Model(inputs=[input_factor1, input_factor2, input_factor3], outputs=nn_classifier)
    zlyNN.compile(optimizer=adam, loss=my_loss, metrics=my_metrics)
    return zlyNN
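

# Minimal shape sanity-check for build_NN_model1 (a sketch on random toy data;
# the sizes here are illustrative assumptions, not the real simulation
# dimensions):
def _smoke_test_model1():
    rng = np.random.RandomState(0)
    fake_omics = [rng.rand(32, 200), rng.rand(32, 150), rng.rand(32, 180)]
    model = build_NN_model1(fake_omics, class_num=5)
    model.summary()  # three Conv1D branches concatenated into one softmax head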


def build_NN_model2(omics, class_num):
    input_dim = omics.shape[1]

    # single CNN branch over the concatenated (early-fused) feature vector
    input_factor1 = Input(shape=(input_dim,), name='omics')
    input_re = Reshape((-1, 1))(input_factor1)
    omics1_cnn = Conv1D(16, 10, activation='relu')(input_re)
    omics1_cnn = MaxPool1D(10)(omics1_cnn)
    omics1_cnn = Conv1D(8, 5, activation='relu')(omics1_cnn)
    omics1_cnn = MaxPool1D(2)(omics1_cnn)
    flatten = Flatten()(omics1_cnn)

    nn_classifier = Dense(50, activation='relu')(flatten)
    if class_num == 2:
        # single sigmoid unit for the binary case
        nn_classifier = Dense(1, activation='sigmoid', name='classifier')(nn_classifier)
    else:
        nn_classifier = Dense(class_num, activation='softmax', name='classifier')(nn_classifier)

    my_metrics = {'classifier': ['acc']}
    if class_num == 2:
        my_loss = {'classifier': 'binary_crossentropy'}
    else:
        my_loss = {'classifier': 'categorical_crossentropy'}
    zlyNN = Model(inputs=[input_factor1], outputs=nn_classifier)
    zlyNN.compile(optimizer='adam', loss=my_loss, metrics=my_metrics)
    return zlyNN
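

# Companion sanity-check for build_NN_model2 (same caveats: toy data, assumed
# sizes). Note the binary branch above: with class_num == 2 the single sigmoid
# output expects a flat 0/1 label vector, not the one-hot matrices built in
# __main__ below.
def _smoke_test_model2():
    rng = np.random.RandomState(0)
    fake_omics = rng.rand(32, 600)  # length must survive Conv1D(16,10) -> Pool(10) -> Conv1D(8,5) -> Pool(2)
    model = build_NN_model2(fake_omics, class_num=5)
    model.summary()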


if __name__ == '__main__':
    # --- Disabled experiment: early-fusion CNN on real cancer multi-omics data ---
    # files = ['breast2']
    # # files = ['gbm']
    # for f in files:
    #     datapath='./data/cancer_d2d/{f}'.format(f=f)
    #     omics1 = np.loadtxt('{}/after_log_exp.txt'.format(datapath),str)
    #     omics1 = np.delete(omics1, 0, axis=1)
    #     #omics1 = np.transpose(omics1)
    #     omics1 = omics1.astype(np.float)
    #     omics1 = normalize(omics1, axis=0, norm='max')
    #     print(omics1.shape)
    #     omics2 = np.loadtxt('{}/after_log_mirna.txt'.format(datapath),str)
    #     omics2= np.delete(omics2, 0, axis=1)
    #     #omics2 = np.transpose(omics2)
    #     omics2 = omics2.astype(np.float)
    #     omics2 = normalize(omics2, axis=0, norm='max')
    #     print(omics2.shape)
    #     omics3 = np.loadtxt('{}/after_methy.txt'.format(datapath),str)
    #     omics3= np.delete(omics3,0,axis=1)
    #     #omics3 = np.transpose(omics3)
    #     omics3 = omics3.astype(np.float)
    #     omics3 = normalize(omics3, axis=0, norm='max')
    #     print(omics3.shape)
    #     labels = np.loadtxt('{datapath}/after_labels.txt'.format(datapath=datapath), str)
    #     labels = np.delete(labels, 0, axis=1)
    #     labels = labels.astype(np.int)
    #     labels = np.squeeze(labels,axis=1)
    #     # k-fold cross-validation
    #     all_acc = []
    #     all_f1_macro = []
    #     all_f1_weighted = []
    #     all_auc_macro = []
    #     all_auc_weighted = []
    #     #omics = np.loadtxt('./result/nmf/mf_em.txt')
    #     omics = np.concatenate((omics1, omics2, omics3), axis=1)
    #     #labels = np.loadtxt('./data/BRCA/labels_all.csv', delimiter=',')
    #     kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
    #     for train_ix, test_ix in kfold.split(omics, labels):
    #         # select rows
    #         train_X, test_X = omics[train_ix], omics[test_ix]
    #         train_y, test_y = labels[train_ix], labels[test_ix]
    #         # summarize train and test composition
    #         unique, count = np.unique(train_y, return_counts=True)
    #         train_data_count = dict(zip(unique, count))
    #         print('train:' + str(train_data_count))
    #         unique, count = np.unique(test_y, return_counts=True)
    #         test_data_count = dict(zip(unique, count))
    #         print('test:' + str(test_data_count))

    #         # one-hot encode the multi-class labels
    #         train_y = list(np.int_(train_y))
    #         y = []
    #         num = len(train_y)
    #         for i in range(num):
    #             tmp = np.zeros(4, dtype='uint8')
    #             tmp[train_y[i]] = 1
    #             y.append(tmp)
    #         train_y = np.array(y)

    #         test_y = list(np.int_(test_y))
    #         y = []
    #         num = len(test_y)
    #         for i in range(num):
    #             tmp = np.zeros(4, dtype='uint8')
    #             tmp[test_y[i]] = 1
    #             y.append(tmp)
    #         test_y = np.array(y)

    #         model = build_NN_model2(omics, 4)
    #         history = model.fit(train_X, train_y, epochs=50, verbose=2, batch_size=8, shuffle=True,
    #                             validation_data=(test_X, test_y))
    #         y_true = []
    #         for i in range(len(test_y)):
    #             y_true.append(np.argmax(test_y[i]))
    #         predictions = model.predict(test_X)
    #         y_pred = []
    #         for i in range(len(predictions)):
    #             y_pred.append(np.argmax(predictions[i]))
    #         acc = accuracy_score(y_true, y_pred)
    #         f1_macro = f1_score(y_true, y_pred, average='macro')
    #         f1_weighted = f1_score(y_true, y_pred, average='weighted')
    #         auc_macro = roc_auc_score(y_true, predictions, multi_class='ovr', average='macro')
    #         auc_weighted = roc_auc_score(y_true, predictions, multi_class='ovr', average='weighted')
    #         all_acc.append(acc)
    #         all_f1_macro.append(f1_macro)
    #         all_f1_weighted.append(f1_weighted)
    #         all_auc_macro.append(auc_macro)
    #         all_auc_weighted.append(auc_weighted)

    #         print(classification_report(y_true, y_pred))
    #         print(acc, f1_macro, f1_weighted, auc_macro, auc_weighted)
    #     print('caicai' * 20)
    #     print(
    #         'acc:{all_acc}\nf1_macro:{all_f1_macro}\nf1_weighted:{all_f1_weighted}\nauc_macro:{all_auc_macro}\nauc_weighted:{all_auc_weighted}'. \
    #         format(all_acc=all_acc, all_f1_macro=all_f1_macro, all_f1_weighted=all_f1_weighted,
    #                all_auc_macro=all_auc_macro, all_auc_weighted=all_auc_weighted))
    #     avg_acc = np.mean(all_acc)
    #     avg_f1_macro = np.mean(all_f1_macro)
    #     avg_f1_weighted = np.mean(all_f1_weighted)
    #     avg_auc_macro = np.mean(all_auc_macro)
    #     avg_auc_weighted = np.mean(all_auc_weighted)
    #     print(
    #         'acc:{avg_acc}\nf1_macro:{avg_f1_macro}\nf1_weighted:{avg_f1_weighted}\nauc_macro:{avg_auc_macro}\nauc_weighted:{avg_auc_weighted}'. \
    #         format(avg_acc=avg_acc, avg_f1_macro=avg_f1_macro, avg_f1_weighted=avg_f1_weighted,
    #                avg_auc_macro=avg_auc_macro, avg_auc_weighted=avg_auc_weighted))

    # --- Disabled experiment: late-fusion CNN (build_NN_model1) on simulated data ---
    # datatypes=["equal","heterogeneous"]
    # typenums=[5,10,15]
    # noise_factor=0.5
    # savepath='./result/simulations/lfcnn_res1.txt'
    # with open(savepath, 'w') as f2:
    #     for datatype in datatypes:
    #         f2.write(datatype+'\n')
    #         for typenum in typenums:
    #             f2.write(str(typenum)+'\n')
    #             datapath='data/simulations/{}/{}'.format(datatype, typenum)
    #             resultpath='result/simulations/{}/{}'.format(datatype, typenum)
    #             labels = np.loadtxt('{}/c.txt'.format(datapath))
    #
    #             omics1 = np.loadtxt('{}/o1.txt'.format(datapath))
    #             omics1 = np.transpose(omics1)
    #             omics1 = normalize(omics1, axis=0, norm='max')
    #
    #             omics2 = np.loadtxt('{}/o2.txt'.format(datapath))
    #             omics2 = np.transpose(omics2)
    #             omics2 = normalize(omics2, axis=0, norm='max')
    #
    #             omics3 = np.loadtxt('{}/o3.txt'.format(datapath))
    #             omics3 = np.transpose(omics3)
    #             omics3 = normalize(omics3, axis=0, norm='max')
    #
    #             omics = np.concatenate((omics1, omics2, omics3), axis=1)
    #
    #             # k-fold cross-validation
    #             all_acc = []
    #             all_f1_macro = []
    #             all_f1_weighted = []
    #
    #             kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
    #             for train_ix, test_ix in kfold.split(omics, labels):
    #                 omics_tobuild=[omics1,omics2,omics3]
    #                 train_X_1=omics1[train_ix]
    #                 train_X_2=omics2[train_ix]
    #                 train_X_3=omics3[train_ix]
    #
    #                 test_X_1=omics1[test_ix]
    #                 test_X_2=omics2[test_ix]
    #                 test_X_3=omics3[test_ix]
    #                 # select rows
    #                 train_X, test_X = [train_X_1,train_X_2,train_X_3],[test_X_1,test_X_2,test_X_3]
    #                 train_y, test_y = labels[train_ix], labels[test_ix]
    #                 # summarize train and test composition
    #                 unique, count = np.unique(train_y, return_counts=True)
    #                 train_data_count = dict(zip(unique, count))
    #                 print('train:' + str(train_data_count))
    #                 unique, count = np.unique(test_y, return_counts=True)
    #                 test_data_count = dict(zip(unique, count))
    #                 print('test:' + str(test_data_count))
    #
    #                 class_num=typenum
    #                 # one-hot encode the multi-class labels
    #                 train_y = list(np.int_(train_y))
    #                 y = []
    #                 num = len(train_y)
    #                 for i in range(num):
    #                     tmp = np.zeros(class_num, dtype='uint8')
    #                     tmp[train_y[i]] = 1
    #                     y.append(tmp)
    #                 train_y = np.array(y)
    #
    #                 test_y = list(np.int_(test_y))
    #                 y = []
    #                 num = len(test_y)
    #                 for i in range(num):
    #                     tmp = np.zeros(class_num, dtype='uint8')
    #                     tmp[test_y[i]] = 1
    #                     y.append(tmp)
    #                 test_y = np.array(y)
    #
    #                 model = build_NN_model1(omics_tobuild,class_num)
    #                 model.summary()
    #                 history = model.fit(train_X, train_y, epochs=50, verbose=2, batch_size=16, shuffle=True,validation_data=(test_X, test_y))
    #                 y_true = []
    #                 for i in range(len(test_y)):
    #                     y_true.append(np.argmax(test_y[i]))
    #                 predictions = model.predict(test_X)
    #                 y_pred = []
    #                 for i in range(len(predictions)):
    #                     y_pred.append(np.argmax(predictions[i]))
    #                 acc = accuracy_score(y_true, y_pred)
    #                 f1_macro = f1_score(y_true, y_pred, average='macro')
    #                 f1_weighted = f1_score(y_true, y_pred, average='weighted')
    #                 all_acc.append(acc)
    #                 all_f1_macro.append(f1_macro)
    #                 all_f1_weighted.append(f1_weighted)
    #
    #                 print(classification_report(y_true, y_pred))
    #                 break
    #             print('caicai' * 20)
    #             print(
    #                 'acc:{all_acc}\nf1_macro:{all_f1_macro}\nf1_weighted:{all_f1_weighted}\n'. \
    #                 format(all_acc=all_acc, all_f1_macro=all_f1_macro, all_f1_weighted=all_f1_weighted))
    #             avg_acc = np.mean(all_acc)
    #             avg_f1_macro = np.mean(all_f1_macro)
    #             avg_f1_weighted = np.mean(all_f1_weighted)
    #
    #             print(
    #                 'acc:{avg_acc}\nf1_macro:{avg_f1_macro}\nf1_weighted:{avg_f1_weighted}\n'. \
    #                 format(avg_acc=avg_acc, avg_f1_macro=avg_f1_macro, avg_f1_weighted=avg_f1_weighted))
    #             f2.write('acc:{avg_acc}\nf1_macro:{avg_f1_macro}\nf1_weighted:{avg_f1_weighted}\n'. \
    #                 format(avg_acc=avg_acc, avg_f1_macro=avg_f1_macro, avg_f1_weighted=avg_f1_weighted))
    #         f2.write('*'*20)

    # --- Active experiment: early-fusion CNN (build_NN_model2) on simulated data ---
    datatypes = ["equal", "heterogeneous"]
    typenums = [5, 10, 15]
    noise_factor = 0.5
    savepath = './result/simulations/efcnn_res1.txt'
    with open(savepath, 'w') as f2:
        for datatype in datatypes:
            f2.write(datatype + '\n')
            for typenum in typenums:
                f2.write(str(typenum) + '\n')
                datapath = 'data/simulations/{}/{}'.format(datatype, typenum)
                resultpath = 'result/simulations/{}/{}'.format(datatype, typenum)
                labels = np.loadtxt('{}/c.txt'.format(datapath))

                # load each omics matrix as samples x features, scaled per feature
                omics1 = np.loadtxt('{}/o1.txt'.format(datapath))
                omics1 = np.transpose(omics1)
                omics1 = normalize(omics1, axis=0, norm='max')

                omics2 = np.loadtxt('{}/o2.txt'.format(datapath))
                omics2 = np.transpose(omics2)
                omics2 = normalize(omics2, axis=0, norm='max')

                omics3 = np.loadtxt('{}/o3.txt'.format(datapath))
                omics3 = np.transpose(omics3)
                omics3 = normalize(omics3, axis=0, norm='max')

                # early fusion: concatenate along the feature axis
                omics = np.concatenate((omics1, omics2, omics3), axis=1)

                # k-fold cross-validation
                all_acc = []
                all_f1_macro = []
                all_f1_weighted = []

                kfold = StratifiedKFold(n_splits=4, shuffle=True, random_state=1)
                for train_ix, test_ix in kfold.split(omics, labels):
                    train_X, test_X = omics[train_ix], omics[test_ix]
                    train_y, test_y = labels[train_ix], labels[test_ix]
                    # summarize train and test composition
                    unique, count = np.unique(train_y, return_counts=True)
                    train_data_count = dict(zip(unique, count))
                    print('train:' + str(train_data_count))
                    unique, count = np.unique(test_y, return_counts=True)
                    test_data_count = dict(zip(unique, count))
                    print('test:' + str(test_data_count))

                    class_num = typenum
                    # one-hot encode the multi-class labels
                    train_y = list(np.int_(train_y))
                    y = []
                    num = len(train_y)
                    for i in range(num):
                        tmp = np.zeros(class_num, dtype='uint8')
                        tmp[train_y[i]] = 1
                        y.append(tmp)
                    train_y = np.array(y)

                    test_y = list(np.int_(test_y))
                    y = []
                    num = len(test_y)
                    for i in range(num):
                        tmp = np.zeros(class_num, dtype='uint8')
                        tmp[test_y[i]] = 1
                        y.append(tmp)
                    test_y = np.array(y)
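                    # Equivalent one-liners for the two encoding loops above (a
                    # sketch; assumes keras.utils.to_categorical, which ships
                    # with Keras 2.x), kept commented out to preserve behavior:
                    #   train_y = keras.utils.to_categorical(labels[train_ix].astype(int), class_num)
                    #   test_y = keras.utils.to_categorical(labels[test_ix].astype(int), class_num)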

                    model = build_NN_model2(omics, class_num)
                    history = model.fit(train_X, train_y, epochs=20, verbose=2, batch_size=8, shuffle=True,
                                        validation_data=(test_X, test_y))
                    y_true = []
                    for i in range(len(test_y)):
                        y_true.append(np.argmax(test_y[i]))
                    predictions = model.predict(test_X)
                    y_pred = []
                    for i in range(len(predictions)):
                        y_pred.append(np.argmax(predictions[i]))
                    acc = accuracy_score(y_true, y_pred)
                    f1_macro = f1_score(y_true, y_pred, average='macro')
                    f1_weighted = f1_score(y_true, y_pred, average='weighted')
                    all_acc.append(acc)
                    all_f1_macro.append(f1_macro)
                    all_f1_weighted.append(f1_weighted)

                    print(classification_report(y_true, y_pred))
                    # NOTE: only the first fold is evaluated per configuration
                    break
                print('caicai' * 20)
                print('acc:{all_acc}\nf1_macro:{all_f1_macro}\nf1_weighted:{all_f1_weighted}\n'
                      .format(all_acc=all_acc, all_f1_macro=all_f1_macro, all_f1_weighted=all_f1_weighted))
                avg_acc = np.mean(all_acc)
                avg_f1_macro = np.mean(all_f1_macro)
                avg_f1_weighted = np.mean(all_f1_weighted)

                print('acc:{avg_acc}\nf1_macro:{avg_f1_macro}\nf1_weighted:{avg_f1_weighted}\n'
                      .format(avg_acc=avg_acc, avg_f1_macro=avg_f1_macro, avg_f1_weighted=avg_f1_weighted))
                f2.write('acc:{avg_acc}\nf1_macro:{avg_f1_macro}\nf1_weighted:{avg_f1_weighted}\n'
                         .format(avg_acc=avg_acc, avg_f1_macro=avg_f1_macro, avg_f1_weighted=avg_f1_weighted))
            f2.write('*'*20)
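
# NOTE: because of the `break` in the fold loop, all_acc / all_f1_* hold a single
# entry per configuration, so the np.mean(...) lines report one fold rather than
# a true 4-fold average; deleting the `break` restores full cross-validation.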