Diff of /large_net/test1.py [000000] .. [f8af2c]


b/large_net/test1.py
import json
import csv
import numpy as np
import os
import pandas as pd
import random
import tensorflow as tf
import matplotlib.pyplot as plt

import keras
from keras.models import Sequential, Model
from keras.utils.generic_utils import get_custom_objects
from keras import backend as K
from keras.layers import Input, Conv1D, Reshape, \
    GlobalAveragePooling2D, Dense, BatchNormalization, Activation, AveragePooling1D, \
    GlobalMaxPooling2D, Flatten, MaxPool1D, Conv2D, MaxPool2D, SeparableConv2D, Conv3D, Add, Dropout
from keras.utils.vis_utils import plot_model
# from keras.applications.mobilenet import DepthwiseConv2D
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

from keras.layers.advanced_activations import LeakyReLU, PReLU
from functions import quan_detector, most_repeared_promoter, dataset
from sklearn.metrics import confusion_matrix
import argparse
from keras.callbacks import LearningRateScheduler, TensorBoard  # ModelCheckpoint is already imported above
from sklearn import metrics
from sklearn.metrics import roc_auc_score

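# Fix the random seeds for NumPy, TensorFlow and Python's random module so runs are reproducible.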
np.random.seed(41)
tf.set_random_seed(41)
random.seed(41)

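# Load the phenotype table: the FID column provides the sample IDs used as keys into the promoter
# JSON files, and the Pheno column provides the class label for each sample.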
labels_file = './labes.csv'
labels_df = pd.read_csv(labels_file, index_col=0)
ids_csv = labels_df.FID.tolist()

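# Indices of the promoter JSON files to load for each chromosome; each file contributes
# a 64-column block of features per sample.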
files_num_chr7 = [1907, 2, 1908, 2780, 2428, 3, 1173, 1291]
files_num_chr17 = [2264, 865, 66, 69, 1931, 71, 1932, 70]
files_num_chr9 = [502, 1420, 1503, 1504, 1505, 1506, 1507, 1508]
files_num_chr22 = [65, 66, 0, 80, 15, 16, 59, 5]

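# Helper that extracts the numeric index from file names of the form '<prefix>_<num>.json' in a
# directory. It is not called anywhere in this script, and the name fn is later reused for the
# false-negative count returned by confusion_matrix.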
def fn(d_dir):
    file_list = os.listdir(d_dir)
    n = [i.split('.')[0] for i in file_list]
    num = [int(i.split('_')[1]) for i in n if len(i) > 3]
    return num


num_7 = len(files_num_chr7)
num_9 = len(files_num_chr9)
num_17 = len(files_num_chr17)
num_22 = len(files_num_chr22)

files_num = files_num_chr7 + files_num_chr9 + files_num_chr17 + files_num_chr22

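# Build the feature matrix: for each selected promoter file, read the per-sample variant vectors
# (64 values each) and place them in the corresponding 64-column block of all_HL_prom
# (11908 samples x 64 * number of promoter files).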
all_HL_prom = np.zeros((11908, 64 * len(files_num)))
for idx in range(len(files_num)):
    promoter_num = files_num[idx]
    if idx < num_7:
        # promoter_file = './chr7_hl_prom/chr22_'+str(promoter_num)+'.json'
        promoter_file = './chr7_hl_prom/chr22_' + str(promoter_num) + '.json'
    elif idx < num_7 + num_9 and idx >= num_7:
        promoter_file = './chr9_hl_prom/chr9_' + str(promoter_num) + '.json'
    elif idx < num_7 + num_9 + num_17 and idx >= num_7 + num_9:
        promoter_file = './chr17_hl_prom/chr22_' + str(promoter_num) + '.json'
    else:
        promoter_file = './chr22_hl_prom/chr22_' + str(promoter_num) + '.json'

    # read files
    with open(promoter_file) as json_data:
        ind_var = json.load(json_data)
    var_num = []
    for i in ids_csv:
        id_name = str(i)
        temp = ind_var[id_name]
        var_seq = [int(t) for t in temp]
        var_num.append(var_seq)
    all_HL_prom[:, idx * 64:(idx + 1) * 64] = np.array(var_num)


print(all_HL_prom.shape)
n = len(files_num)

labels_df['vars'] = all_HL_prom.tolist()
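# One-hot encode the phenotype (1 = control, 2 = ALS) and, separately, the batch prefix of each FID.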
lab_num = {1: [1, 0],  # control
           2: [0, 1]}  # ALS
lab_num_batch = {'c1': [1, 0, 0, 0],  # control
                 'c3': [0, 1, 0, 0],
                 'c5': [0, 0, 1, 0],
                 'c44': [0, 0, 0, 1]}  # ALS

pheno_new = []
pheno_batch = []
for i in labels_df.Pheno.tolist():
    pheno_new.append(lab_num[i])

# for i in labels_df.Sex.tolist():
#     pheno_new.append(lab_num[i])
for i in labels_df.FID.tolist():
    l = i.split('-')[0]
    pheno_batch.append(lab_num_batch[l])

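# Assemble the full design matrix X (variant features) and the one-hot label matrix Y.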
d = {"Pheno": pheno_new, "Vars": labels_df.vars}
dataset_ = pd.DataFrame(d)
dataset_X = np.array(dataset_.Vars.tolist())
dataset_Y = np.array(dataset_.Pheno.tolist())


N, M = dataset_X.shape
print(N, M)
# network accuracy

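# Sample indices for the train/validation pool and for the test set are read from
# train_id.txt and test_id.txt (one integer index per line).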
t_idx = [int(line.strip()) for line in open("train_id.txt", 'r')]
te_idx = [int(line.strip()) for line in open("test_id.txt", 'r')]
x_tv = dataset_X[t_idx]
y_tv = dataset_Y[t_idx]
x_test = dataset_X[te_idx]
y_test = dataset_Y[te_idx]


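# Split the train/validation pool with the project's dataset() helper from functions.py;
# test_ratio=0.05 presumably sets the fraction held out as the validation set.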
x_train, y_train, x_val, y_val, tr_idx, val_idx = dataset(x_tv, y_tv, test_ratio=0.05)


# print x_test.shape
num_classes = y_test.shape[-1]
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_val = x_val.astype('float32')

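# Reshape to (samples, M, 1) so each sample is a length-M sequence with a single channel,
# matching the Conv1D input expected by architecture_X.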
x_train = x_train.astype('float32').reshape((len(x_train), M, 1))
x_test = x_test.astype('float32').reshape((len(x_test), M, 1))
x_val = x_val.astype('float32').reshape((len(x_val), M, 1))

# class counts per split (column sums of the one-hot labels)
print(np.sum(y_test, axis=0))
print(np.sum(y_val, axis=0))
print(np.sum(y_train, axis=0))

# shared keyword arguments for the Conv2D and MaxPool2D layers below
conv_kwargs = {'padding': 'same',
               'data_format': 'channels_first'}

mp_kwargs = {'padding': 'same',
             'data_format': 'channels_first'}


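# Network definition: a Conv1D front end with kernel size and stride 64 collapses each 64-wide
# promoter block to one position, the result is reshaped to an (input_length/64, 16, 16) volume,
# and stacks of Conv2D/SeparableConv2D blocks merged with Concatenate and two Add() skip
# connections feed a GlobalAveragePooling2D head, two dense layers and a softmax output.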
def architecture_X(input_shape, num_classes):
    act = 'relu'

    # use the input_shape argument (the script passes M) instead of the global M
    x = Input(shape=(input_shape, 1))
    input_Conv = Conv1D(filters=256, kernel_size=64, strides=64)(x)
    input_Conv = Conv1D(filters=256, kernel_size=1, strides=1)(input_Conv)
    input_BN = BatchNormalization(epsilon=1e-05)(input_Conv)
    # note: the output of this Conv1D is never used; the activation below is applied to input_BN
    input_Conv = Conv1D(filters=256, kernel_size=1, strides=1)(input_Conv)
    input_act = Activation(act)(input_BN)

    reshape_h = Reshape((int(input_shape / 64), 16, 16))(input_act)
    # DepthwiseConv2D(32, strides=(2, 2), **conv_kwargs)(x)

    # add traditional conv
    conv1 = Conv2D(64, (3, 3), activation=act, **conv_kwargs)(reshape_h)
    conv1 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv1)
    conv1 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv1)
    mp1 = MaxPool2D(pool_size=(2, 2), **mp_kwargs)(conv1)

    Sconv1 = SeparableConv2D(128, (2, 2), activation=act, **conv_kwargs)(mp1)
    # Sconv1 = SeparableConv2D(32, (2, 2), activation=act, **conv_kwargs)(Sconv1)
    # Sconv1 = SeparableConv2D(32, (2, 2), activation=act, **conv_kwargs)(Sconv1)
    Smp1 = MaxPool2D(pool_size=(2, 2), **mp_kwargs)(Sconv1)

    Smp1 = Conv2D(256, (1, 1), activation=act, **conv_kwargs)(Smp1)

    Sconv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(Smp1)
    Sconv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(Sconv2)
    Sconv2 = SeparableConv2D(64, (2, 2), activation=act, **conv_kwargs)(Sconv2)
    Smp2 = Sconv2  # MaxPool2D(pool_size=(2, 2), **mp_kwargs)(Sconv2)

    conv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(Smp1)
    conv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv2)
    conv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv2)
    mp2 = conv2  # MaxPool2D(pool_size=(2, 2), **mp_kwargs)(conv2)

    sum_add = keras.layers.Concatenate(axis=-1)([mp2, Smp2])

    conv3 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(sum_add)
    conv3 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv3)
    conv3 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv3)
    conv3 = MaxPool2D(pool_size=(1, 2), **mp_kwargs)(conv3)

    conv3 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv3)
    conv3 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv3)
    conv3 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv3)

    # new = keras.layers.dot([conv2, conv3], axes=-2)
    conv4 = keras.layers.Add()([Sconv2, conv2, conv3])

    conv4 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv4)
    conv4 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv4)
    conv4 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv4)
    # conv4_ = Conv2D(64, (1, 1), activation=act, **conv_kwargs)(conv4)

    conv5 = keras.layers.Add()([conv4, conv3])
    conv5 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv5)
    conv5 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv5)
    conv5 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv5)
    # conv5 = keras.layers.Add()([conv4, conv5])

    mp3 = GlobalAveragePooling2D(data_format='channels_first')(conv5)

    flatten = mp3  # Flatten()(mp3)

    d1 = Dense(64 * 4, activation='linear')(flatten)
    d1_act = Activation(act)(d1)

    d2 = Dense(16, activation='linear')(d1_act)
    flatten = Activation(act)(d2)

    # added
    # flatten = BatchNormalization(epsilon=1e-05)(flatten)
    pred = Dense(num_classes, activation='softmax')(flatten)
    # Compile model
    model = Model(inputs=[x], outputs=[pred])
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adagrad(lr=0.002, decay=0.0001),  # 0.0013
                  metrics=['accuracy'])
    return model

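# Step learning-rate schedule intended for LearningRateScheduler; it is not registered as a callback below.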
def scheduler(epoch):
    if epoch < 50:
        return 0.1
    if epoch < 122:
        return 0.01
    return 0.003  # 0.001

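# Build the model and train it; only the ReduceLROnPlateau callback is passed to fit()
# (earlystop and scheduler are defined but unused).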
cnn = architecture_X(M, num_classes)
reduce_lr = ReduceLROnPlateau(monitor='val_acc',
                              factor=0.7,
                              patience=50,
                              min_lr=0.00001, verbose=1)
earlystop = EarlyStopping(monitor='val_acc', patience=25, verbose=0, mode='auto')

# print cnn.summary()
history = cnn.fit(x_train, y_train,
                  batch_size=16 * 2,  # 640
                  epochs=30,  # 300
                  verbose=1, callbacks=[reduce_lr],
                  validation_data=(x_val, y_val))

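# Evaluate the CNN on the held-out test set: accuracy, precision, recall and F1 from the
# confusion matrix, plus ROC AUC.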
print("=" * 5)
251
print(np.sum(y_test,axis=0))
252
print(cnn.evaluate(x_test,y_test))
253
y_pred = cnn.predict(x_test)
254
y_pred_ = np.argmax(y_pred, axis=1)
255
y_test_num = np.argmax(y_test, axis=1)
256
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred_).ravel()
257
258
acc = (tp + tn) * 1. / (tp + fp + tn + fn)
259
260
# from scikitplot as skplt
261
# skplt.metrics.plot_roc_curve(y_test_num,y_pred)
262
263
print('='*10)
264
print('='*5,'  ','Our network')
265
print('='*10)
266
267
print('test: ', acc)
268
269
ps = tp*1./(tp+fp)
270
rc = tp*1./(tp+fn)
271
print('Accuracy:',(tp+tn)*1./(tp+tn+fp+fn))
272
print("Pression: ", ps)
273
print("Recall:", rc)
274
print("F1: ",2*(ps*rc)/(ps+rc))
275
276
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test_num, np.max(y_pred,axis=-1))
277
print("auc: ",metrics.auc(fp_rate, tp_rate))
278
279
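# Baseline 1: logistic regression on the flattened features, scored with the same confusion-matrix metrics.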
print('=' * 10)
print('=' * 5, '  ', 'LR')
print('=' * 10)
from sklearn.linear_model import LogisticRegression

X = x_train.reshape((len(x_train), M))
y = y_train.argmax(axis=1).astype('float32')
logisticRegr = LogisticRegression(C=10, max_iter=10)
logisticRegr.fit(X, y)
y_pred = logisticRegr.predict(x_test.reshape((len(x_test), M)))

y_test_num = np.argmax(y_test, axis=1)
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()

acc = (tp + tn) * 1. / (tp + fp + tn + fn)
ps = tp * 1. / (tp + fp)
rc = tp * 1. / (tp + fn)
print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
print("Precision: ", ps)
print("Recall:", rc)
print("F1: ", 2 * (ps * rc) / (ps + rc))


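# Baseline 2: support vector machine (RBF kernel by default) on the same flattened features.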
print('=' * 10)
print('=' * 5, '  ', 'SVM')
print('=' * 10)
from sklearn import svm
X = x_train.reshape((len(x_train), M))
y = y_train.argmax(axis=1).astype('float32')
clf = svm.SVC(gamma=0.001)
clf.fit(X, y)
y_pred = clf.predict(x_test.reshape((len(x_test), M)))

y_test_num = np.argmax(y_test, axis=1)
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()

acc = (tp + tn) * 1. / (tp + fp + tn + fn)
ps = tp * 1. / (tp + fp)
rc = tp * 1. / (tp + fn)
print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
print("Precision: ", ps)
print("Recall:", rc)
print("F1: ", 2 * (ps * rc) / (ps + rc))
# print("auc: ", roc_auc_score(y_test_num, clf.predict_proba(x_test.reshape((len(x_test), M)))))


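# A sketch (not part of the original script) of how the commented-out AUC line above could be made
# to work: SVC only exposes predict_proba when constructed with probability=True, and
# roc_auc_score expects the positive-class column, e.g.
#     clf = svm.SVC(gamma=0.001, probability=True)
#     clf.fit(X, y)
#     proba = clf.predict_proba(x_test.reshape((len(x_test), M)))
#     print("auc: ", roc_auc_score(y_test_num, proba[:, 1]))

# Baseline 3: random forest on the same flattened features.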
print('=' * 10)
print('=' * 5, '  ', 'RF')
print('=' * 10)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
X = x_train.reshape((len(x_train), M))
y = y_train.argmax(axis=1).astype('float32')
# Instantiate model with 1000 decision trees
rf = RandomForestClassifier(max_depth=5, n_estimators=1000, max_features=100)
# Train the model on training data
rf.fit(X, y)

y_pred = rf.predict(x_test.reshape((len(x_test), M)))

y_test_num = np.argmax(y_test, axis=1)
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()

acc = (tp + tn) * 1. / (tp + fp + tn + fn)

ps = tp * 1. / (tp + fp)
rc = tp * 1. / (tp + fn)
print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
print("Precision: ", ps)
print("Recall:", rc)
print("F1: ", 2 * (ps * rc) / (ps + rc))

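# Baseline 4: AdaBoost over depth-3 decision trees on the same flattened features.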
print('=' * 10)
print('=' * 5, '  ', 'AdaBoost')
print('=' * 10)
from sklearn.tree import DecisionTreeClassifier
X = x_train.reshape((len(x_train), M))
y = y_train.argmax(axis=1).astype('float32')
# Boosted ensemble of 1000 depth-3 decision trees
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm="SAMME.R",
                         n_estimators=1000)
# Train the model on training data
bdt.fit(X, y)

y_pred = bdt.predict(x_test.reshape((len(x_test), M)))

y_test_num = np.argmax(y_test, axis=1)
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()

acc = (tp + tn) * 1. / (tp + fp + tn + fn)

ps = tp * 1. / (tp + fp)
rc = tp * 1. / (tp + fn)
print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
print("Precision: ", ps)
print("Recall:", rc)
print("F1: ", 2 * (ps * rc) / (ps + rc))