# large_net/test1.py
|
|
import json
import csv
import numpy as np
import os
import pandas as pd
import random
import tensorflow as tf
import matplotlib.pyplot as plt
|
|
|
import keras
from keras.models import Sequential, Model
from keras.utils.generic_utils import get_custom_objects
from keras import backend as K
from keras.layers import Input, Conv1D, Reshape, \
    GlobalAveragePooling2D, Dense, BatchNormalization, Activation, AveragePooling1D, \
    GlobalMaxPooling2D, Flatten, MaxPool1D, Conv2D, MaxPool2D, SeparableConv2D, Conv3D, Add, Dropout
from keras.utils.vis_utils import plot_model
# from keras.applications.mobilenet import DepthwiseConv2D
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

from keras.layers.advanced_activations import LeakyReLU, PReLU
from functions import quan_detector, most_repeared_promoter, dataset
from sklearn.metrics import confusion_matrix
import argparse
from keras.callbacks import LearningRateScheduler, TensorBoard
from sklearn import metrics
from sklearn.metrics import roc_auc_score
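# (Environment note: the import paths above -- keras.layers.advanced_activations,
# keras.utils.vis_utils, tf.set_random_seed -- match standalone Keras 2.x on a
# TensorFlow 1.x backend; newer versions moved or renamed these.)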
|
|
|
# Fix the random seeds (NumPy, TensorFlow 1.x, Python) for reproducibility.
np.random.seed(41)
tf.set_random_seed(41)
random.seed(41)
|
|
|
# Load the phenotype labels; ids_csv holds the sample FIDs in row order.
labels_file = './labes.csv'
labels_df = pd.read_csv(labels_file, index_col=0)
ids_csv = labels_df.FID.tolist()
|
|
|
# Selected promoter-file indices per chromosome (8 files each).
files_num_chr7 = [1907, 2, 1908, 2780, 2428, 3, 1173, 1291]
files_num_chr17 = [2264, 865, 66, 69, 1931, 71, 1932, 70]
files_num_chr9 = [502, 1420, 1503, 1504, 1505, 1506, 1507, 1508]
files_num_chr22 = [65, 66, 0, 80, 15, 16, 59, 5]
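# 4 chromosomes x 8 files = 32 promoter files; with 64 columns per file, the
# matrix assembled below has 32 * 64 = 2048 features per sample.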
|
|
|
def fn(d_dir):
    # Collect the numeric suffixes of the files in d_dir,
    # e.g. 'chr7_1907.json' -> 1907. (Defined but unused in this script.)
    file_list = os.listdir(d_dir)
    n = [i.split('.')[0] for i in file_list]
    num = [int(i.split('_')[1]) for i in n if len(i) > 3]
    return num
|
|
|
num_7 = len(files_num_chr7)
num_9 = len(files_num_chr9)
num_17 = len(files_num_chr17)
num_22 = len(files_num_chr22)

files_num = files_num_chr7 + files_num_chr9 + files_num_chr17 + files_num_chr22
|
|
|
# Assemble the feature matrix: one row per sample, 64 columns per promoter file.
all_HL_prom = np.zeros((11908, 64 * len(files_num)))
for idx in range(len(files_num)):
    promoter_num = files_num[idx]
    if idx < num_7:
        # NOTE: the chr7 and chr17 directories use a 'chr22_' file-name prefix;
        # kept as-is to match the data layout on disk.
        promoter_file = './chr7_hl_prom/chr22_' + str(promoter_num) + '.json'
    elif idx < num_7 + num_9 and idx >= num_7:
        promoter_file = './chr9_hl_prom/chr9_' + str(promoter_num) + '.json'
    elif idx < num_7 + num_9 + num_17 and idx >= num_7 + num_9:
        promoter_file = './chr17_hl_prom/chr22_' + str(promoter_num) + '.json'
    else:
        promoter_file = './chr22_hl_prom/chr22_' + str(promoter_num) + '.json'

    # Read the per-promoter JSON and stack every sample's 64 variant calls.
    with open(promoter_file) as json_data:
        ind_var = json.load(json_data)
    var_num = []
    for i in ids_csv:
        id_name = str(i)
        temp = ind_var[id_name]
        var_seq = [int(t) for t in temp]
        var_num.append(var_seq)
    all_HL_prom[:, idx * 64:(idx + 1) * 64] = np.array(var_num)
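# A minimal sketch of the assumed per-promoter JSON layout (inferred from the
# loop above; the FID keys shown are hypothetical):
#   {"c1-0001": [0, 1, 0, ...],   # 64 variant calls per sample
#    "c3-0042": [1, 0, 0, ...]}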
|
|
|
print(all_HL_prom.shape)
n = len(files_num)
|
|
|
labels_df['vars'] = all_HL_prom.tolist()
lab_num = {1: [1, 0],   # control
           2: [0, 1]}   # ALS
lab_num_batch = {'c1': [1, 0, 0, 0],   # one-hot batch code per FID prefix
                 'c3': [0, 1, 0, 0],
                 'c5': [0, 0, 1, 0],
                 'c44': [0, 0, 0, 1]}
|
|
|
pheno_new = []
pheno_batch = []
for i in labels_df.Pheno.tolist():
    pheno_new.append(lab_num[i])

# for i in labels_df.Sex.tolist():
#     pheno_new.append(lab_num[i])
for i in labels_df.FID.tolist():
    l = i.split('-')[0]
    pheno_batch.append(lab_num_batch[l])
|
|
|
d = {"Pheno": pheno_new, "Vars": labels_df.vars}
dataset_ = pd.DataFrame(d)
dataset_X = np.array(dataset_.Vars.tolist())
dataset_Y = np.array(dataset_.Pheno.tolist())
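# At this point dataset_X is (n_samples, 2048) and dataset_Y is (n_samples, 2)
# one-hot labels (control / ALS).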
|
|
|
N, M = dataset_X.shape
print(N, M)
# network accuracy
|
|
|
# Fixed train/test partition: sample indices stored one per line in text files.
t_idx = [int(line.strip()) for line in open("train_id.txt", 'r')]
te_idx = [int(line.strip()) for line in open("test_id.txt", 'r')]
x_tv = dataset_X[t_idx]
y_tv = dataset_Y[t_idx]
x_test = dataset_X[te_idx]
y_test = dataset_Y[te_idx]
|
|
|
# dataset() (imported from functions) carves a 5% validation split out of the
# training pool.
x_train, y_train, x_val, y_val, tr_idx, val_idx = dataset(x_tv, y_tv, test_ratio=0.05)
|
|
|
# print x_test.shape
num_classes = y_test.shape[-1]

# Cast to float32 and add a channel axis: (samples, M) -> (samples, M, 1).
x_train = x_train.astype('float32').reshape((len(x_train), M, 1))
x_test = x_test.astype('float32').reshape((len(x_test), M, 1))
x_val = x_val.astype('float32').reshape((len(x_val), M, 1))
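# Sanity check below: per-class sample counts (column sums of the one-hot
# labels) for the test, validation and training splits.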
|
|
|
print(np.sum(y_test, axis=0))
print(np.sum(y_val, axis=0))
print(np.sum(y_train, axis=0))
|
|
conv_kwargs = {'padding': 'same',
               'data_format': 'channels_first'}

mp_kwargs = {'padding': 'same',
             'data_format': 'channels_first'}
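# High-level flow of architecture_X below: a stride-64 Conv1D summarizes each
# 64-SNP promoter block into a 256-vector, Reshape turns the sequence into
# (M/64) channels of 16x16 maps, and parallel plain/separable conv branches
# are merged with Concatenate/Add skips before global average pooling and a
# softmax head.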
|
|
|
def architecture_X(input_shape, num_classes):
    act = 'relu'

    x = Input(shape=(input_shape, 1))
    # A stride-64 Conv1D summarizes each 64-SNP promoter block as 256 features.
    input_Conv = Conv1D(filters=256, kernel_size=64, strides=64)(x)
    input_Conv = Conv1D(filters=256, kernel_size=1, strides=1)(input_Conv)
    input_BN = BatchNormalization(epsilon=1e-05)(input_Conv)
    # input_Conv = Conv1D(filters=256, kernel_size=1, strides=1)(input_Conv)  # dead: output never used
    input_act = Activation(act)(input_BN)

    # (M/64, 256) -> (M/64, 16, 16): each promoter becomes a channel whose 256
    # features form a 16x16 map (channels_first in the conv kwargs).
    reshape_h = Reshape((int(input_shape / 64), 16, 16))(input_act)
    # DepthwiseConv2D(32, strides=(2, 2), **conv_kwargs)(x)
|
|
|
    # Plain convolution block followed by max-pooling.
    conv1 = Conv2D(64, (3, 3), activation=act, **conv_kwargs)(reshape_h)
    conv1 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv1)
    conv1 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv1)
    mp1 = MaxPool2D(pool_size=(2, 2), **mp_kwargs)(conv1)

    Sconv1 = SeparableConv2D(128, (2, 2), activation=act, **conv_kwargs)(mp1)
    # Sconv1 = SeparableConv2D(32, (2, 2), activation=act, **conv_kwargs)(Sconv1)
    # Sconv1 = SeparableConv2D(32, (2, 2), activation=act, **conv_kwargs)(Sconv1)
    Smp1 = MaxPool2D(pool_size=(2, 2), **mp_kwargs)(Sconv1)

    Smp1 = Conv2D(256, (1, 1), activation=act, **conv_kwargs)(Smp1)

    # Two parallel branches off Smp1: a separable-conv branch ...
    Sconv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(Smp1)
    Sconv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(Sconv2)
    Sconv2 = SeparableConv2D(64, (2, 2), activation=act, **conv_kwargs)(Sconv2)
    Smp2 = Sconv2  # MaxPool2D(pool_size=(2, 2), **mp_kwargs)(Sconv2)

    # ... and a plain-conv branch.
    conv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(Smp1)
    conv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv2)
    conv2 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv2)
    mp2 = conv2  # MaxPool2D(pool_size=(2, 2), **mp_kwargs)(conv2)
|
|
|
    sum_add = keras.layers.Concatenate(axis=-1)([mp2, Smp2])

    conv3 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(sum_add)
    conv3 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv3)
    conv3 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv3)
    conv3 = MaxPool2D(pool_size=(1, 2), **mp_kwargs)(conv3)

    conv3 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv3)
    conv3 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv3)
    conv3 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv3)

    # Residual-style merge of the branch outputs.
    # new = keras.layers.dot([conv2, conv3], axes=-2)
    conv4 = keras.layers.Add()([Sconv2, conv2, conv3])

    conv4 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv4)
    conv4 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv4)
    conv4 = Conv2D(64, (2, 2), activation=act, **conv_kwargs)(conv4)
    # conv4_ = Conv2D(64, (1, 1), activation=act, **conv_kwargs)(conv4)

    conv5 = keras.layers.Add()([conv4, conv3])
    conv5 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv5)
    conv5 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv5)
    conv5 = Conv2D(128, (2, 2), activation=act, **conv_kwargs)(conv5)
    # conv5 = keras.layers.Add()([conv4, conv5])

    mp3 = GlobalAveragePooling2D(data_format='channels_first')(conv5)
|
|
|
    flatten = mp3  # Flatten()(mp3)

    d1 = Dense(64 * 4, activation='linear')(flatten)
    d1_act = Activation(act)(d1)

    d2 = Dense(16, activation='linear')(d1_act)
    flatten = Activation(act)(d2)

    # added
    # flatten = BatchNormalization(epsilon=1e-05)(flatten)
    pred = Dense(num_classes, activation='softmax')(flatten)
    # Compile model
|
|
    model = Model(inputs=[x], outputs=[pred])
    model.compile(loss='categorical_crossentropy',
                  optimizer=keras.optimizers.Adagrad(lr=0.002, decay=0.0001),  # 0.0013
                  metrics=['accuracy'])
    return model
|
|
def scheduler(epoch):
    # Step learning-rate schedule (defined for LearningRateScheduler but never
    # attached to a callback in this script).
    if epoch < 50:
        return 0.1
    if epoch < 122:
        return 0.01
    return 0.003  # 0.001
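# To actually apply scheduler(), a callback could be wired in, e.g. (sketch):
# lr_cb = LearningRateScheduler(scheduler)
# and then pass callbacks=[reduce_lr, lr_cb] to fit() below.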
|
|
|
cnn = architecture_X(M, num_classes)
reduce_lr = ReduceLROnPlateau(monitor='val_acc',
                              factor=0.7,
                              patience=50,
                              min_lr=0.00001, verbose=1)
# earlystop is created but not passed to fit() below.
earlystop = EarlyStopping(monitor='val_acc', patience=25, verbose=0, mode='auto')
|
|
|
# print cnn.summary()
history = cnn.fit(x_train, y_train,
                  batch_size=16 * 2,  # 640
                  epochs=30,  # 300
                  verbose=1, callbacks=[reduce_lr],
                  validation_data=(x_val, y_val))
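# matplotlib is imported but unused; a sketch for inspecting the training
# curves (the 'acc'/'val_acc' keys assume Keras 2.x metric naming):
# plt.plot(history.history['acc']); plt.plot(history.history['val_acc'])
# plt.xlabel('epoch'); plt.ylabel('accuracy'); plt.savefig('history.png')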
|
|
|
print("=" * 5)
print(np.sum(y_test, axis=0))
print(cnn.evaluate(x_test, y_test))
y_pred = cnn.predict(x_test)
y_pred_ = np.argmax(y_pred, axis=1)
y_test_num = np.argmax(y_test, axis=1)
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred_).ravel()

acc = (tp + tn) * 1. / (tp + fp + tn + fn)

# import scikitplot as skplt
# skplt.metrics.plot_roc_curve(y_test_num, y_pred)
|
|
|
print('=' * 10)
print('=' * 5, ' ', 'Our network')
print('=' * 10)

print('test: ', acc)

ps = tp * 1. / (tp + fp)
rc = tp * 1. / (tp + fn)
print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
print("Precision: ", ps)
print("Recall:", rc)
print("F1: ", 2 * (ps * rc) / (ps + rc))

# The ROC needs the positive-class probability (column 1 = ALS under lab_num),
# not the winning-class score np.max(y_pred, axis=-1).
fp_rate, tp_rate, thresholds = metrics.roc_curve(y_test_num, y_pred[:, 1])
print("auc: ", metrics.auc(fp_rate, tp_rate))
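# The confusion-matrix report below is repeated verbatim for LR, SVM, RF and
# AdaBoost; a helper like this sketch could replace the duplication (left
# unused here to keep the original flow):
# def report(y_true, y_hat):
#     tn, fp, fn, tp = confusion_matrix(y_true, y_hat).ravel()
#     ps, rc = tp * 1. / (tp + fp), tp * 1. / (tp + fn)
#     print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
#     print('Precision:', ps, 'Recall:', rc, 'F1:', 2 * ps * rc / (ps + rc))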
|
|
|
print('=' * 10)
print('=' * 5, ' ', 'LR')
print('=' * 10)
from sklearn.linear_model import LogisticRegression

X = x_train.reshape((len(x_train), M))
y = y_train.argmax(axis=1).astype('float32')
# max_iter=10 is far below sklearn's default; expect a ConvergenceWarning.
logisticRegr = LogisticRegression(C=10, max_iter=10)
logisticRegr.fit(X, y)
y_pred = logisticRegr.predict(x_test.reshape((len(x_test), M)))
|
|
|
y_test_num = np.argmax(y_test, axis=1)
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()

acc = (tp + tn) * 1. / (tp + fp + tn + fn)
ps = tp * 1. / (tp + fp)
rc = tp * 1. / (tp + fn)
print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
print("Precision: ", ps)
print("Recall:", rc)
print("F1: ", 2 * (ps * rc) / (ps + rc))
|
|
|
print('=' * 10)
print('=' * 5, ' ', 'SVM')
print('=' * 10)
from sklearn import svm

X = x_train.reshape((len(x_train), M))
y = y_train.argmax(axis=1).astype('float32')
clf = svm.SVC(gamma=0.001)
clf.fit(X, y)
y_pred = clf.predict(x_test.reshape((len(x_test), M)))

y_test_num = np.argmax(y_test, axis=1)
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()

acc = (tp + tn) * 1. / (tp + fp + tn + fn)
ps = tp * 1. / (tp + fp)
rc = tp * 1. / (tp + fn)
print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
print("Precision: ", ps)
print("Recall:", rc)
print("F1: ", 2 * (ps * rc) / (ps + rc))
# predict_proba would also need the model fitted with SVC(..., probability=True):
# print("auc: ", roc_auc_score(y_test_num, clf.predict_proba(x_test.reshape((len(x_test), M)))[:, 1]))
|
|
|
print('=' * 10)
print('=' * 5, ' ', 'RF')
print('=' * 10)
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

X = x_train.reshape((len(x_train), M))
y = y_train.argmax(axis=1).astype('float32')
# Instantiate the model with 1000 decision trees
rf = RandomForestClassifier(max_depth=5, n_estimators=1000, max_features=100)
# Train the model on the training data
rf.fit(X, y)

y_pred = rf.predict(x_test.reshape((len(x_test), M)))

y_test_num = np.argmax(y_test, axis=1)
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()

acc = (tp + tn) * 1. / (tp + fp + tn + fn)

ps = tp * 1. / (tp + fp)
rc = tp * 1. / (tp + fn)
print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
print("Precision: ", ps)
print("Recall:", rc)
print("F1: ", 2 * (ps * rc) / (ps + rc))
|
|
|
print('=' * 10)
print('=' * 5, ' ', 'AdaBoost')
print('=' * 10)
from sklearn.tree import DecisionTreeClassifier

X = x_train.reshape((len(x_train), M))
y = y_train.argmax(axis=1).astype('float32')
# Boost 1000 depth-3 decision trees; SAMME.R weights by class probabilities.
bdt = AdaBoostClassifier(DecisionTreeClassifier(max_depth=3),
                         algorithm="SAMME.R",
                         n_estimators=1000)
# Train the model on the training data
bdt.fit(X, y)

y_pred = bdt.predict(x_test.reshape((len(x_test), M)))

y_test_num = np.argmax(y_test, axis=1)
tn, fp, fn, tp = confusion_matrix(y_test_num, y_pred).ravel()

acc = (tp + tn) * 1. / (tp + fp + tn + fn)

ps = tp * 1. / (tp + fp)
rc = tp * 1. / (tp + fn)
print('Accuracy:', (tp + tn) * 1. / (tp + tn + fp + fn))
print("Precision: ", ps)
print("Recall:", rc)
print("F1: ", 2 * (ps * rc) / (ps + rc))