b/GP/Deep_genomic.py

""" |
|
|
2 |
Deep_genomic.py allows predicting complex traits by using Deep Learning and Penalized Liner models |
|
|
3 |
Authors: Laura M Zingaretti (m.lau.zingaretti@gmail.com) and Miguel Perez-Enciso (miguel.perez@uab.es) |
|
|
4 |
""" |
|
|
5 |
'''
Copyright (C) 2019 Laura Zingaretti
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''

import matplotlib as mpl
mpl.use('Agg')
from CNN.cnn import acc_pearson_r
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import pandas as pd
import numpy as np
from run_cnn import run_cnn_main
from run_mlp import run_mlp_main
from run_ridge import run_ridge_main
from run_lasso import run_lasso_main
#from run_rnn import run_rnn_main
import seaborn as sns
from sklearn.model_selection import train_test_split

from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical

import os
import sys
import argparse
import logging


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument("--Xtr", required=True, help="train predictor matrix")
    parser.add_argument("--ytr", required=True, help="train response matrix")
    parser.add_argument("--cat", required=False, help="Should the y_train matrix be normalized within categories? If so, provide the name of the category column, e.g. --cat T")
    # if you only want to train the model, you don't need a validation set
    parser.add_argument("--Xval", default=None, required=False, help="validation predictor matrix")
    parser.add_argument("--yval", default=None, required=False, help="validation response matrix")
    parser.add_argument("--scale", help="Boolean flag indicating whether y should be scaled", action="store_true")
    parser.add_argument("--dummy", help="Convert SNPs into one-hot encoding", action="store_true")
    parser.add_argument("--categorical", help="Are the outputs categorical variables?", action="store_true")
    parser.add_argument("--gridProp", help="proportion of the hyperparameter grid sampled by the random search", default=1, type=float)
    parser.add_argument("--trait", help="name of the trait to be used in the analysis", default=None, required=False)
    parser.add_argument("--output", help="path to the output dir", required=True)
    Method = parser.add_mutually_exclusive_group()
    Method.add_argument('--mlp', action='store_true')
    Method.add_argument('--cnn', action='store_true')
    Method.add_argument('--lasso', action='store_true')
    Method.add_argument('--ridge', action='store_true')
    parser.add_argument('--lr', type=float, nargs='+', help='learning rate for the NN configuration; can be a list to tune', required=False, default=0.025)
    parser.add_argument('--dr1', type=float, nargs='+', help='dropout for the first layer of the NN configuration; can be a list to tune', required=False, default=0)
    parser.add_argument('--dr2', type=float, nargs='+', help='dropout for the hidden layers of the NN configuration; can be a list to tune', required=False, default=0)
    parser.add_argument('--reg1', type=float, nargs='+', help='L2 regularization for the first layer of the NN configuration; can be a list to tune', required=False, default=0)
    parser.add_argument('--nconv', type=int, nargs='+', help='number of convolutional layers; only used when the method is CNN; can be a list to tune', default=1)
    parser.add_argument('--act1', action='store', type=str, nargs='+', default=['relu', 'elu', 'linear', 'softmax', 'tanh', 'softplus'], help="Examples: --act1 relu elu, --act1 linear")
    parser.add_argument('--act2', action='store', type=str, nargs='+', default=['relu', 'elu', 'linear', 'softmax', 'tanh', 'softplus'], help="Examples: --act2 relu elu, --act2 linear")
    parser.add_argument('--reg2', type=float, nargs='+', help='L2 regularization for the hidden layers of the NN configuration; can be a list to tune', required=False, default=0)
    parser.add_argument('--reg3', type=float, nargs='+', help='L2 regularization for the output layer of the NN configuration; can be a list to tune', required=False, default=0)
    parser.add_argument('--hn', type=int, nargs='+', help='number of hidden neurons in the NN configuration; can be a list to tune', required=False, default=8)
    parser.add_argument('--ps', type=int, nargs='+', help='pool size for the CNN configuration; can be a list to tune', required=False, default=3)
    parser.add_argument('--hl', type=int, nargs='+', help='number of hidden layers in the NN configuration; can be a list to tune', required=False, default=1)
    parser.add_argument('--ks', type=int, nargs='+', help='kernel size for the CNN configuration; can be a list to tune', required=False, default=3)
    parser.add_argument('--ns', type=int, nargs='+', help='stride for the CNN configuration; can be a list to tune', required=False, default=1)
    parser.add_argument('--epochs', type=int, nargs='+', help='number of epochs for training; can be a list to tune', required=False, default=50)
    parser.add_argument('--nfilters', type=int, nargs='+', help='number of filters (convolutions per convolutional layer); can be a list to tune', required=False, default=8)
    parser.add_argument('--optimizer', action='store', type=str, nargs='+', default=['Adam', 'Nadam', 'sgd'], help="Examples: --optimizer Adam")
    parser.add_argument('--bs', type=int, nargs='+', help='batch size for the NN configuration; can be a list to tune', required=False, default=16)
    parser.add_argument('--N_neurons_FL', type=int, nargs='+', help='number of neurons in the first layer of the MLP configuration; can be a list to tune', required=False, default=8)

    args = parser.parse_args()
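
    # Example invocation (illustrative only; the input file and trait names are hypothetical):
    #   python Deep_genomic.py --Xtr geno_tr.txt --ytr pheno_tr.txt \
    #       --Xval geno_val.txt --yval pheno_val.txt --output out_dir \
    #       --mlp --scale --trait trait1 --lr 0.01 0.025 --hn 8 32 --epochs 50
    # Hyperparameters passed as lists (e.g. --lr 0.01 0.025) define the grid
    # explored by the run_* routines, and --gridProp gives the proportion of
    # that grid sampled by the random search.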
|
|
    # Hyperparameters may be given as a single value or as a list to tune;
    # wrap scalars in a list so the downstream grid search always sees lists.
    for name in ('hn', 'nconv', 'ps', 'hl', 'ks', 'ns', 'epochs', 'nfilters',
                 'bs', 'N_neurons_FL', 'reg1', 'reg2', 'reg3', 'dr1', 'dr2', 'lr'):
        value = getattr(args, name)
        if not isinstance(value, list):
            setattr(args, name, [value])

    if args.dummy:
        # one-hot encode the SNP genotypes
        if args.Xval is not None:
            Xtr = pd.read_csv(args.Xtr, sep=r'\s+')
            Xval = pd.read_csv(args.Xval, sep=r'\s+')
            All_X = pd.concat([Xtr, Xval])
            All_X = All_X.round(decimals=0)
            #All_X = All_X.apply(pd.to_numeric)
            label_encoder = LabelEncoder()
            X_enco = All_X.apply(label_encoder.fit_transform)
            onehot_encoder = OneHotEncoder(sparse=False)
            All_X_oh = onehot_encoder.fit_transform(X_enco)
            X_tr = All_X_oh[0:Xtr.shape[0], :]
            X_vl = All_X_oh[Xtr.shape[0]:All_X_oh.shape[0], :]
        else:
            X_tr = pd.read_csv(args.Xtr, sep=r'\s+').round(decimals=0)
            #X_tr = X_tr.apply(pd.to_numeric)
            label_encoder = LabelEncoder()
            X_enco = X_tr.apply(label_encoder.fit_transform)
            onehot_encoder = OneHotEncoder(sparse=False)
            X_tr = onehot_encoder.fit_transform(X_enco)
            y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
            # no validation set supplied: hold out 20% of the training data
            X_tr, X_vl, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2)
    else:
        if args.Xval is not None:
            X_tr = pd.read_csv(args.Xtr, sep=r'\s+').apply(pd.to_numeric)
            X_vl = pd.read_csv(args.Xval, sep=r'\s+').apply(pd.to_numeric)
        else:
            # no validation set supplied: hold out 20% of the training data
            X_tr = pd.read_csv(args.Xtr, sep=r'\s+').apply(pd.to_numeric)
            y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
            X_tr, X_vl, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2)

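    # Illustration of the --dummy one-hot encoding above (made-up genotypes):
    # a marker column coded 0/1/2 is label-encoded and then expanded into one
    # indicator column per observed genotype, e.g.
    #   [[0], [2], [1]]  ->  [[1, 0, 0],
    #                         [0, 0, 1],
    #                         [0, 1, 0]]
    # so the width of X grows with the number of distinct calls per marker.
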
    if args.categorical is False:
        if args.scale is True:
            if args.cat is not None:
                # standardize the phenotypes within each level of the --cat
                # column, so every level ends up with zero mean and unit variance
                if args.yval is not None:
                    y_tr = pd.read_csv(args.ytr, sep=r'\s+')
                    y_val = pd.read_csv(args.yval, sep=r'\s+').apply(pd.to_numeric)
                    cat = args.cat
                    loc = y_tr.columns.get_loc(cat)
                    Factors = np.unique(y_tr.iloc[:, loc])
                    scaled = pd.DataFrame([])
                    for i in Factors:
                        M = y_tr.iloc[np.where(y_tr.iloc[:, loc] == i)]
                        M = M.iloc[:, :-1].apply(scale)
                        scaled = scaled.append(M)

                    y_val = y_val.apply(scale)
                    scaled.columns = y_tr.columns[:-1]
                    y_val.columns = y_tr.columns[:-1]
                    y_tr = scaled

                else:
                    y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
                    y_tr = y_tr.apply(scale)
                    y_val = y_val.apply(scale)
            else:
                if args.yval is not None:
                    y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
                    y_val = pd.read_csv(args.yval, sep=r'\s+').apply(pd.to_numeric)
                    y_val = y_val.apply(scale)
                    y_tr = y_tr.apply(scale)
                else:
                    y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
                    y_tr = y_tr.apply(scale)
                    y_val = y_val.apply(scale)
        else:
            if args.yval is not None:
                y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
                y_val = pd.read_csv(args.yval, sep=r'\s+').apply(pd.to_numeric)
            else:
                y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)

        if args.trait is not None:
            # keep only the requested trait column
            trait = args.trait
            loc = y_tr.columns.get_loc(trait)
            y_tr = y_tr.iloc[:, loc]
            y_val = y_val.iloc[:, loc]

        else:
            trait = None
            y_tr = np.asarray(y_tr)
            y_val = np.asarray(y_val)
    else:
        if args.yval is not None:
            y_tr = pd.read_csv(args.ytr, sep=r'\s+')
            y_val = pd.read_csv(args.yval, sep=r'\s+')
        else:
            y_tr = pd.read_csv(args.ytr, sep=r'\s+')
        if args.trait is not None:
            # encode the selected trait as one-hot class labels for the NN output
            trait = args.trait
            loc = y_tr.columns.get_loc(trait)
            y_tr = y_tr.iloc[:, loc]
            y_val = y_val.iloc[:, loc]
            y_tr = np.asarray(y_tr)
            y_val = np.asarray(y_val)
            encoder = LabelEncoder()
            encoder.fit(y_tr)
            y_tr = encoder.transform(y_tr)
            y_tr = to_categorical(y_tr)
            encoder = LabelEncoder()
            encoder.fit(y_val)
            y_val = encoder.transform(y_val)
            y_val = to_categorical(y_val)
        else:
            trait = None
            if len(y_tr.columns) > 1:
                sys.exit("For categorical outputs, only one trait is allowed")
            else:
                y_tr = np.asarray(y_tr)
                y_val = np.asarray(y_val)
                encoder = LabelEncoder()
                encoder.fit(y_tr)
                y_tr = encoder.transform(y_tr)
                y_tr = to_categorical(y_tr)
                encoder = LabelEncoder()
                encoder.fit(y_val)
                y_val = encoder.transform(y_val)
                y_val = to_categorical(y_val)

    X_vl = np.asarray(X_vl)
    X_tr = np.asarray(X_tr)

    main = os.getcwd()
    diro = args.output

    ###to cat

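    # Dispatch to the selected method; the --mlp/--cnn/--lasso/--ridge flags
    # are mutually exclusive, so at most one of the branches below runs.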
    if args.cnn:
        if trait is not None:
            run_cnn_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp, trait=trait, cat=args.categorical, lr=args.lr, dr_1=args.dr1, dr_2=args.dr2, reg_1=args.reg1, reg_2=args.reg2, reg_3=args.reg3, nconv=args.nconv, act_1=args.act1, act_2=args.act2, hn=args.hn, ps=args.ps, hl=args.hl, ks=args.ks, ns=args.ns, epochs=args.epochs, nf=args.nfilters, op=args.optimizer, bs=args.bs)
        else:
            run_cnn_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp, trait=None, cat=args.categorical, lr=args.lr, dr_1=args.dr1, dr_2=args.dr2, reg_1=args.reg1, reg_2=args.reg2, reg_3=args.reg3, nconv=args.nconv, act_1=args.act1, act_2=args.act2, hn=args.hn, ps=args.ps, hl=args.hl, ks=args.ks, ns=args.ns, epochs=args.epochs, nf=args.nfilters, op=args.optimizer, bs=args.bs)
    if args.mlp:
        if trait is not None:
            run_mlp_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp, trait=trait, cat=args.categorical, lr=args.lr, dr_1=args.dr1, dr_2=args.dr2, reg_1=args.reg1, reg_2=args.reg2, act_1=args.act1, hn=args.hn, hl=args.hl, epochs=args.epochs, op=args.optimizer, bs=args.bs, N_neurons_FL=args.N_neurons_FL)
        else:
            run_mlp_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp, trait=None, cat=args.categorical, lr=args.lr, dr_1=args.dr1, dr_2=args.dr2, reg_1=args.reg1, reg_2=args.reg2, act_1=args.act1, hn=args.hn, hl=args.hl, epochs=args.epochs, op=args.optimizer, bs=args.bs, N_neurons_FL=args.N_neurons_FL)
    if args.ridge:
        if args.categorical is True:
            sys.exit("ridge only supports continuous outputs")
        else:
            run_ridge_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main)
    if args.lasso:
        if args.categorical is True:
            sys.exit("lasso only supports continuous outputs")
        else:
            run_lasso_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main)