Diff of /GP/Deep_genomic.py [000000] .. [5bc2f8]

Switch to unified view

a b/GP/Deep_genomic.py
1
"""
2
 Deep_genomic.py allows predicting complex traits by using Deep Learning and Penalized Liner models
3
 Authors: Laura M Zingaretti (m.lau.zingaretti@gmail.com) and Miguel Perez-Enciso (miguel.perez@uab.es)
4
"""
5
'''
6
    Copyright (C) 2019  Laura Zingaretti
7
    This program is free software: you can redistribute it and/or modify
8
    it under the terms of the GNU General Public License as published by
9
    the Free Software Foundation, either version 3 of the License, or
10
    (at your option) any later version.
11
    This program is distributed in the hope that it will be useful,
12
    but WITHOUT ANY WARRANTY; without even the implied warranty of
13
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14
    GNU General Public License for more details.
15
    You should have received a copy of the GNU General Public License
16
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
17
'''
18
19
import matplotlib as mpl
20
mpl.use('Agg')
21
from CNN.cnn import acc_pearson_r  as acc_pearson_r
22
import tensorflow as tf
23
from keras.backend.tensorflow_backend import set_session
24
import pandas as pd
25
import numpy as np
26
from run_cnn import run_cnn_main
27
from run_mlp import run_mlp_main
28
from run_ridge import run_ridge_main
29
from run_lasso import run_lasso_main
30
#from run_rnn import run_rnn_main
31
import seaborn as sns
32
from sklearn.model_selection import train_test_split
33
34
35
from scipy import stats
36
from sklearn.decomposition import PCA
37
from sklearn.preprocessing import StandardScaler
38
from sklearn.preprocessing import scale
39
40
41
from sklearn.preprocessing import LabelEncoder
42
from sklearn.preprocessing import OneHotEncoder
43
from keras.utils import to_categorical
44
45
46
import os
47
48
import sys
49
import argparse
50
import logging
51
52
53
if __name__ == '__main__':
54
55
    parser = argparse.ArgumentParser()
56
    parser.add_argument("--Xtr", required=True, help="train Predictor matrix")
57
    parser.add_argument("--ytr", required=True,help= "train response matrix")
58
    parser.add_argument("--cat", required=False, help="Should be y_train matrix normalized by categories? If yes, category should be provided as cat argument, e.g. --cat T ")
59
    # if you only want to train the model, you don't need a validation set
60
    parser.add_argument("--Xval", default=None,required=False, help="validation Predictor")
61
    parser.add_argument("--yval",default=None, required=False,help= "validation response matrix")
62
    parser.add_argument("--scale",help="Boolean indicating if y should be scaled", action="store_true")
63
    parser.add_argument("--dummy", help="Convert SNPs into OneHot encoding", action="store_true")
64
    parser.add_argument("--categorical", help="Are outputs categorical variables?", action="store_true")
65
    parser.add_argument("--gridProp", help="proportion of random search", default=1, type=float)
66
    parser.add_argument("--trait", help="name of trait to be used in the analysis", default=None,required=False)
67
    parser.add_argument("--output", help="path to output dir",required=True)
68
    Method = parser.add_mutually_exclusive_group()
69
    Method.add_argument('--mlp', action='store_true')
70
    Method.add_argument('--cnn', action='store_true')
71
    Method.add_argument('--lasso', action='store_true')
72
    Method.add_argument('--ridge', action='store_true')
73
    parser.add_argument('--lr',type=float, nargs='+', help='the learning rate parameter to be used in NN configuration. It can be a list to tune', required=False,default=0.025)
74
    parser.add_argument('--dr1',type=float, nargs='+', help='the dropout to be used in  the first layer of  NN configuration. It can be a list to tune', required=False, default=0)
75
    parser.add_argument('--dr2',type=float, nargs='+', help='the dropout to be used in  the hidden_layers of the NN configuration. It can be a list to tune', required=False, default=0)
76
    parser.add_argument('--reg1',type=float, nargs='+', help='the L2 regularization to be used in  the first layer of  NN configuration. It can be a list to tune', required=False,default=0)
77
    parser.add_argument('--nconv',type=int, nargs='+',help='number of convolutions layers to be considered in convolutional operation. It only works if Method is CNN and it can be a list to tune', default=1)
78
    parser.add_argument('--act1', action='store',type=str, nargs='+', default=['relu', 'elu', 'linear', 'softmax', 'tanh', 'softplus'],help="Examples: --act1 relu elu, --act1 linear")
79
    parser.add_argument('--act2', action='store',type=str, nargs='+', default=['relu', 'elu', 'linear', 'softmax', 'tanh', 'softplus'],help="Examples: --act2 relu elu, --act2 linear")
80
    parser.add_argument('--reg2',type=float, nargs='+', help='the L2 regularization to be used in  the hidden_layers of  NN configuration. It can be a list to tune', required=False,default=0)
81
    parser.add_argument('--reg3',type=float, nargs='+', help='the L2 regularization to be used in  the output layers of the  NN configuration. It can be a list to tune', required=False,default=0)
82
    parser.add_argument('--hn',type=int, nargs='+', help='Number of hidden neurons to be used on the NN configuration. It can be a list to tune', required=False, default=8)
83
    parser.add_argument('--ps',type=int, nargs='+', help='Pool size to be used on the CNN configuration. It can be a list to tune', required=False,default=3)
84
    parser.add_argument('--hl',type=int, nargs='+', help='Number of hidden layers to be used in the NN configuration. It can be a list to tune', required=False,default=1)
85
    parser.add_argument('--ks',type=int, nargs='+', help='kernel_size to be used in CNN configuration. It can be a list to tune', required=False,default=3)
86
    parser.add_argument('--ns',type=int, nargs='+', help='stride to be used in CNN configuration. It can be a list to tune', required=False,default=1)
87
    parser.add_argument('--epochs',type=int, nargs='+', help='nepochs to be used for training. It can be a list to tune', required=False,default=50)
88
    parser.add_argument('--nfilters',type=int, nargs='+', help='number of filter (#convolutions in each convolutional layer). It can be a list to tune', required=False,default=8)
89
    parser.add_argument('--optimizer', action='store',type=str, nargs='+', default=['Adam', 'Nadam', 'sgd'],help="Examples: -optimizer Adam")
90
    parser.add_argument('--bs',type=int, nargs='+', help='batch size to be used in NN configuration. It can be a list to tune', required=False,default=16)
91
    parser.add_argument('--N_neurons_FL',type=int, nargs='+', help='NNeurons to be used at the first layer of MLP configuration. It can be a list to tune', required=False,default=8)
92
93
    args = parser.parse_args()
94
95
    if not isinstance(args.hn, list):
96
           args.hn = [args.hn]
97
    if not isinstance(args.nconv, list):
98
           args.nconv = [args.nconv]
99
    if not isinstance(args.ps, list):
100
           args.ps = [args.ps]
101
    if not isinstance(args.hl, list):
102
           args.hl = [args.hl]
103
    if not isinstance(args.ks, list):
104
           args.ks = [args.ks]
105
    if not isinstance(args.ns, list):
106
           args.ns = [args.ns]
107
    if not isinstance(args.epochs, list):
108
           args.epochs = [args.epochs]
109
    if not isinstance(args.nfilters, list):
110
           args.nfilters = [args.nfilters]
111
    if not isinstance(args.bs, list):
112
           args.bs = [args.bs]
113
    if not isinstance(args.N_neurons_FL, list):
114
        args.N_neurons_FL = [args.N_neurons_FL]
115
    if not isinstance(args.reg1, list):
116
           args.reg1 = [args.reg1]
117
    if not isinstance(args.reg2, list):
118
           args.reg2 = [args.reg2]
119
    if not isinstance(args.reg3, list):
120
           args.reg3 = [args.reg3]
121
    if not isinstance(args.dr1, list):
122
           args.dr1 = [args.dr1]
123
    if not isinstance(args.dr2, list):
124
           args.dr2 = [args.dr2]
125
    if not isinstance(args.lr, list):
126
           args.lr = [args.lr]
127
128
    if args.dummy:
129
        if args.Xval is not None:
130
            Xtr=pd.read_csv(args.Xtr,sep='\s+')
131
            Xval=pd.read_csv(args.Xval,sep='\s+')
132
            All_X = pd.concat([Xtr,Xval])
133
            All_X = All_X.round(decimals=0)
134
            #All_X = All_X.apply(pd.to_numeric)
135
            le = OneHotEncoder(sparse=False)
136
            label_encoder = LabelEncoder()
137
            X_enco = All_X.apply(label_encoder.fit_transform)
138
            onehot_encoder = OneHotEncoder(sparse=False)
139
            All_X_oh = onehot_encoder.fit_transform(X_enco)
140
            X_tr = All_X_oh[0:Xtr.shape[0], :]
141
            X_vl = All_X_oh[Xtr.shape[0]:All_X_oh.shape[0], ]
142
        else:
143
            X_tr = pd.read_csv(args.Xtr,sep='\s+').round(decimals=0)
144
            #X_tr = X_tr.apply(pd.to_numeric)
145
            le = OneHotEncoder(sparse=False)
146
            label_encoder = LabelEncoder()
147
            X_enco = X_tr.apply(label_encoder.fit_transform)
148
            onehot_encoder = OneHotEncoder(sparse=False)
149
            X_tr = onehot_encoder.fit_transform(X_enco)
150
            y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
151
            X_tr, X_vl, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2)
152
    else:
153
        if args.Xval is not None:
154
            X_tr= pd.read_csv(args.Xtr,sep='\s+').apply(pd.to_numeric)
155
            X_vl= pd.read_csv(args.Xval,sep='\s+').apply(pd.to_numeric)
156
        else:
157
            X_tr=pd.read_csv(args.Xtr,sep='\s+').apply(pd.to_numeric)
158
            y_tr=pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
159
            X_tr, X_vl, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2)
160
    if args.categorical is False:
161
        if args.scale is True:
162
            if args.cat is not None:
163
                if args.yval is not None:
164
                    y_tr = pd.read_csv(args.ytr,sep='\s+')
165
                    y_val = pd.read_csv(args.yval,sep='\s+').apply(pd.to_numeric)
166
                    cat = args.cat
167
                    loc = y_tr.columns.get_loc(cat)
168
                    Factors=np.unique(y_tr.iloc[:, loc])
169
                    scaled=pd.DataFrame([])
170
                    for i in Factors:
171
                        M = y_tr.iloc[np.where(y_tr.iloc[:, loc] == i)]
172
                        M = M.iloc[:, :-1].apply(scale)
173
                        scaled=scaled.append(M)
174
175
                    y_val = y_val.apply(scale)
176
                    scaled.columns=y_tr.columns[:-1]
177
                    y_val.columns=y_tr.columns[:-1]
178
                    y_tr = scaled
179
180
                else:
181
                    y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
182
                    y_tr = y_tr.apply(scale)
183
                    y_val = y_val.apply(scale)
184
            else:
185
                if args.yval is not None:
186
                    y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
187
                    y_val = pd.read_csv(args.yval,sep='\s+').apply(pd.to_numeric)
188
                    y_val = y_val.apply(scale)
189
                    y_tr = y_tr.apply(scale)
190
                else:
191
                    y_tr = pd.read_csv(args.ytr).apply(pd.to_numeric)
192
                    y_tr = y_tr.apply(scale)
193
                    y_val = y_val.apply(scale)
194
        else:
195
            if args.yval is not None:
196
                y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
197
                y_val = pd.read_csv(args.yval,sep='\s+').apply(pd.to_numeric)
198
            else:
199
                y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
200
201
        if args.trait is not None:
202
            trait= args.trait
203
            loc=y_tr.columns.get_loc(trait)
204
            y_tr=y_tr.iloc[:,loc]
205
            y_val=y_val.iloc[:,loc]
206
207
        else:
208
            trait=None
209
        y_tr = np.asarray(y_tr)
210
        y_val = np.asarray(y_val)
211
    else:
212
        if args.yval is not None:
213
            y_tr = pd.read_csv(args.ytr,sep='\s+')
214
            y_val = pd.read_csv(args.yval,sep='\s+')
215
        else:
216
            y_tr = pd.read_csv(args.ytr,sep='\s+')
217
        if args.trait is not None:
218
            trait = args.trait
219
            loc = y_tr.columns.get_loc(trait)
220
            y_tr = y_tr.iloc[:, loc]
221
            y_val = y_val.iloc[:, loc]
222
            y_tr = np.asarray(y_tr)
223
            y_val = np.asarray(y_val)
224
            encoder = LabelEncoder()
225
            encoder.fit(y_tr)
226
            y_tr = encoder.transform(y_tr)
227
            y_tr = to_categorical(y_tr)
228
            encoder = LabelEncoder()
229
            encoder.fit(y_val)
230
            y_val = encoder.transform(y_val)
231
            y_val=to_categorical(y_val)
232
        else:
233
            if len(y_tr.columns) > 1:
234
                sys.exit("To categorical Values,Only one trait is allowed ")
235
            else:
236
                y_tr = np.asarray(y_tr)
237
                y_val = np.asarray(y_val)
238
                encoder = LabelEncoder()
239
                encoder.fit(y_tr)
240
                y_tr = encoder.transform(y_tr)
241
                y_tr = to_categorical(y_tr)
242
                encoder = LabelEncoder()
243
                encoder.fit(y_val)
244
                y_val = encoder.transform(y_val)
245
                y_val = to_categorical(y_val)
246
247
    X_vl=np.asarray(X_vl)
248
    X_tr=np.asarray(X_tr)
249
250
    main=os.getcwd()
251
    diro=args.output
252
253
###to cat
254
255
256
    if args.cnn:
257
        if trait is not None:
258
            run_cnn_main(X_tr,y_tr,X_vl,y_val,output=diro,main=main,prop=args.gridProp,trait=trait,cat=args.categorical,lr=args.lr,dr_1=args.dr1,dr_2=args.dr2,reg_1=args.reg1,reg_2=args.reg2,reg_3=args.reg3,nconv=args.nconv,act_1=args.act1,act_2=args.act2,hn=args.hn,ps=args.ps,hl=args.hl,ks=args.ks,ns=args.ns,epochs=args.epochs,nf=args.nfilters,op=args.optimizer,bs=args.bs)
259
        else:
260
            run_cnn_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp,trait=None,cat=args.categorical,lr=args.lr,dr_1=args.dr1,dr_2=args.dr2,reg_1=args.reg1,reg_2=args.reg2,reg_3=args.reg3,nconv=args.nconv,act_1=args.act1,act_2=args.act2,hn=args.hn,ps=args.ps,hl=args.hl,ks=args.ks,ns=args.ns,epochs=args.epochs,nf=args.nfilters,op=args.optimizer,bs=args.bs)
261
    if args.mlp:
262
        if trait is not None:
263
            run_mlp_main(X_tr,y_tr,X_vl,y_val,output=diro,main=main,prop=args.gridProp,trait=trait,cat=args.categorical,lr=args.lr,dr_1=args.dr1,dr_2=args.dr2,reg_1=args.reg1,reg_2=args.reg2,act_1=args.act1,hn=args.hn,hl=args.hl,epochs=args.epochs,op=args.optimizer,bs=args.bs,N_neurons_FL=args.N_neurons_FL)
264
        else:
265
            run_mlp_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp,trait=None,cat=args.categorical,lr=args.lr,dr_1=args.dr1,dr_2=args.dr2,reg_1=args.reg1,reg_2=args.reg2,act_1=args.act1,hn=args.hn,hl=args.hl,epochs=args.epochs,op=args.optimizer,bs=args.bs,N_neurons_FL=args.N_Neurons_FL)
266
    if args.ridge:
267
        if args.categorical is True:
268
            sys.exit("ridge only allowing to continous outputs")
269
        else:
270
            run_ridge_main(X_tr,y_tr,X_vl,y_val,output=diro,main=main)
271
    if args.lasso:
272
273
        if args.categorical is True:
274
            sys.exit("ridge only allowing to continous outputs")
275
        else:
276
            run_lasso_main(X_tr,y_tr,X_vl,y_val,output=diro,main=main)