# PathCNN.py
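# Trains a CNN on pathway-level PCA features from three omics data types
# (EXP, CNV, MT), together with patient age, to classify 2-year survival
# (OS > 24 months vs. an event within 24 months). Performance is summarized
# as AUC over repeated stratified 5-fold cross-validation.
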
from __future__ import print_function
import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D, AveragePooling2D, Input
from keras import backend as K
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.utils import class_weight
#from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
from keras import regularizers
from keras.layers.core import *
from keras.models import Model
from keras.optimizers import Adam
from sklearn.metrics import confusion_matrix

# pathway-level principal components for each omics type, plus clinical data
pca_exp = pd.read_excel("data/PCA_EXP.xlsx", header=None)
pca_cnv = pd.read_excel("data/PCA_CNV.xlsx", header=None)
pca_mt = pd.read_excel("data/PCA_MT.xlsx", header=None)
clinical = pd.read_excel("data/Clinical.xlsx")

n = len(pca_exp)  # sample size: number of patients
path_n = 146      # number of pathways
pc = 5            # number of PCs per pathway

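# Each PCA_* sheet stores one row per patient containing path_n * pc values
# (pc principal components per pathway, laid out pathway by pathway); the
# loops below reshape each row into a (path_n, pc) matrix per patient.
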
# data creation - EXP
pca_exp = pca_exp.to_numpy()
exp_data = np.zeros((n, path_n, pc))
for i in range(n):
    for j in range(path_n):
        exp_data[i, j, :] = pca_exp[i, j * pc:(j + 1) * pc]

# data creation - CNV
pca_cnv = pca_cnv.to_numpy()
cnv_data = np.zeros((n, path_n, pc))
for i in range(n):
    for j in range(path_n):
        cnv_data[i, j, :] = pca_cnv[i, j * pc:(j + 1) * pc]

# data creation - MT
pca_mt = pca_mt.to_numpy()
mt_data = np.zeros((n, path_n, pc))
for i in range(n):
    for j in range(path_n):
        mt_data[i, j, :] = pca_mt[i, j * pc:(j + 1) * pc]

# data merge: mRNA expression, CNV, and MT with a specific number of PCs
no_pc = 2  # use the first 2 of the 5 PCs
all_data = np.zeros((n, path_n, no_pc * 3))
for i in range(n):
    all_data[i, :, :] = np.concatenate(
        (exp_data[i, :, 0:no_pc], cnv_data[i, :, 0:no_pc], mt_data[i, :, 0:no_pc]), axis=1)

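# all_data now holds one (path_n x no_pc*3) "pathway image" per patient:
# 146 pathway rows by 6 columns (2 PCs each from EXP, CNV, and MT).
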
clinical = clinical.to_numpy()
age = clinical[:, 4]
survival = clinical[:, 5]
os_months = clinical[:, 6]
# class 0: event (survival == 1) with OS <= 24 months; class 1: OS > 24 months
idx0 = np.where((survival == 1) & (os_months <= 24))
idx1 = np.where(os_months > 24)

bio_data = clinical[:, 7:10]
all_data = all_data[:, :, :]

data_0 = all_data[idx0, :, :]
data_0 = data_0[0, :, :, :]
data_1 = all_data[idx1, :, :]
data_1 = data_1[0, :, :, :]

age_0 = age[idx0]
age_1 = age[idx1]

bio_data_0 = bio_data[idx0, :]
bio_data_0 = bio_data_0[0, :, :]
bio_data_1 = bio_data[idx1, :]
bio_data_1 = bio_data_1[0, :, :]

outcomes_0 = np.zeros(len(idx0[0]))
outcomes_1 = np.ones(len(idx1[0]))

## data merge
age_r = np.concatenate((age_0, age_1))
data = np.concatenate((data_0, data_1))
outcomes = np.concatenate((outcomes_0, outcomes_1))
bio_data_r = np.concatenate((bio_data_0, bio_data_1))

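# data: (samples, 146, 6) pathway images in class order; outcomes: 0/1 labels;
# age_r and bio_data_r: the matching clinical variables.
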
## DEEP LEARNING
batch_size = 64
epochs = 30

num_classes = 2
num_runs = 30
k_fold = 5

# input image dimensions: path_n pathways x (no_pc * 3) PCs
img_rows, img_cols = 146, 6

auc = []

all_observed = np.array([])
all_predicted = np.array([])

for i in range(num_runs):
    print(i)
    # 5-fold cross-validation
    skf = StratifiedKFold(n_splits=5)

    # observed labels and predicted scores pooled across the five test folds
    observed = np.array([])
    predicted = np.array([])

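    # each run: stratified 5-fold CV over all samples; within every training
    # portion, one further stratified split provides a validation set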
    for train1, test in skf.split(data, outcomes):
        skf1 = StratifiedKFold(n_splits=5)
        train_data = data[train1, :, :]
        train_outcomes = outcomes[train1]

        for train, validation in skf1.split(train_data, train_outcomes):
            break  # keep only the first inner split

        x_train = train_data[train, :, :]
        y_train = train_outcomes[train]

        x_validation = train_data[validation, :, :]
        y_validation = train_outcomes[validation]

        x_test = data[test, :, :]
        y_test = outcomes[test]

        # age
        age_r1 = age_r[train1]
        age_train = age_r1[train]
        age_validation = age_r1[validation]
        age_test = age_r[test]

        # bio data (clinical columns 7-9)
        bio_data_r1 = bio_data_r[train1, :]
        bio_data_train = bio_data_r1[train, :]
        bio_data_validation = bio_data_r1[validation, :]
        bio_data_test = bio_data_r[test, :]

        # age + bio data combined; not passed to the model below
        cli_train = np.concatenate((np.matrix([age_train]).T, bio_data_train), axis=1)
        cli_validation = np.concatenate((np.matrix([age_validation]).T, bio_data_validation), axis=1)
        cli_test = np.concatenate((np.matrix([age_test]).T, bio_data_test), axis=1)

        # add a channel dimension so each sample is a (146, 6, 1) image
        x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)
        x_validation = x_validation.reshape(x_validation.shape[0], img_rows, img_cols, 1)
        x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols, 1)
        input_shape = (img_rows, img_cols, 1)

        x_train = x_train.astype('float32')
        x_validation = x_validation.astype('float32')
        x_test = x_test.astype('float32')

        print('x_train shape:', x_train.shape)
        print(x_train.shape[0], 'train samples')
        print(x_validation.shape[0], 'validation samples')
        print(x_test.shape[0], 'test samples')

        # one-hot encode the binary outcome
        y_train = keras.utils.to_categorical(y_train, num_classes)
        y_validation = keras.utils.to_categorical(y_validation, num_classes)
        y_test = keras.utils.to_categorical(y_test, num_classes)

        image_input = Input(shape=input_shape)
        other_data_input = Input(shape=(1,))  # clinical input: age

        # first convolution
        conv1 = Conv2D(32, kernel_size=(3, 3),
                       activation='relu', padding='same')(image_input)
        # second convolution
        conv2 = Conv2D(64, (3, 3), activation='relu', padding='same')(conv1)
        conv2 = MaxPooling2D(pool_size=(4, 2))(conv2)
        conv2 = Dropout(0.25)(conv2)
        first_part_output = Flatten()(conv2)

        # merge the flattened CNN features with the clinical input
        merged_model = keras.layers.concatenate([first_part_output, other_data_input])
        merged_model = Dense(64, activation='relu')(merged_model)
        merged_model = Dropout(0.5)(merged_model)

        predictions = Dense(num_classes, activation='softmax')(merged_model)

        # build and compile the model
        model = Model(inputs=[image_input, other_data_input], outputs=predictions)
        model.summary()
        layers = model.layers

        lr = 0.0001
        beta_1 = 0.9
        beta_2 = 0.999
        optimizer = Adam(lr=lr, beta_1=beta_1, beta_2=beta_2)

        model.compile(optimizer=optimizer, loss='binary_crossentropy',
                      metrics=['accuracy'])

        class_weighting = {0: 1, 1: 4.2}  # class weights to address class imbalance
        model.fit([x_train, age_train], [y_train],
                  batch_size=batch_size,
                  epochs=epochs,
                  verbose=1, class_weight=class_weighting,
                  validation_data=([x_validation, age_validation], [y_validation]))

        # predicted class probabilities on the held-out test fold
        score = model.predict([x_test, age_test], verbose=0)

        if len(observed) == 0:
            observed = y_test
            predicted = score
        else:
            observed = np.concatenate((observed, y_test))
            predicted = np.concatenate((predicted, score))

    # AUC for this run, computed over the pooled test-fold predictions
    auc.append(roc_auc_score(observed[:, 0], predicted[:, 0]))

    if len(all_observed) == 0:
        all_observed = observed[:, 0]
        all_predicted = predicted[:, 0]
    else:
        all_observed = np.concatenate((all_observed, observed[:, 0]))
        all_predicted = np.concatenate((all_predicted, predicted[:, 0]))

    print(auc)  # per-run AUCs so far

print(auc)
pd.DataFrame(auc).to_csv("results.csv")