Diff of /GP/Deep_genomic.py [000000] .. [5bc2f8]

Switch to side-by-side view

--- a
+++ b/GP/Deep_genomic.py
@@ -0,0 +1,276 @@
+"""
+ Deep_genomic.py allows predicting complex traits by using Deep Learning and Penalized Liner models
+ Authors: Laura M Zingaretti (m.lau.zingaretti@gmail.com) and Miguel Perez-Enciso (miguel.perez@uab.es)
+"""
+'''
+    Copyright (C) 2019  Laura Zingaretti
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+'''
+
+import matplotlib as mpl
+mpl.use('Agg')
+from CNN.cnn import acc_pearson_r  as acc_pearson_r
+import tensorflow as tf
+from keras.backend.tensorflow_backend import set_session
+import pandas as pd
+import numpy as np
+from run_cnn import run_cnn_main
+from run_mlp import run_mlp_main
+from run_ridge import run_ridge_main
+from run_lasso import run_lasso_main
+#from run_rnn import run_rnn_main
+import seaborn as sns
+from sklearn.model_selection import train_test_split
+
+
+from scipy import stats
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+from sklearn.preprocessing import scale
+
+
+from sklearn.preprocessing import LabelEncoder
+from sklearn.preprocessing import OneHotEncoder
+from keras.utils import to_categorical
+
+
+import os
+
+import sys
+import argparse
+import logging
+
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--Xtr", required=True, help="train Predictor matrix")
+    parser.add_argument("--ytr", required=True,help= "train response matrix")
+    parser.add_argument("--cat", required=False, help="Should be y_train matrix normalized by categories? If yes, category should be provided as cat argument, e.g. --cat T ")
+    # if you only want to train the model, you don't need a validation set
+    parser.add_argument("--Xval", default=None,required=False, help="validation Predictor")
+    parser.add_argument("--yval",default=None, required=False,help= "validation response matrix")
+    parser.add_argument("--scale",help="Boolean indicating if y should be scaled", action="store_true")
+    parser.add_argument("--dummy", help="Convert SNPs into OneHot encoding", action="store_true")
+    parser.add_argument("--categorical", help="Are outputs categorical variables?", action="store_true")
+    parser.add_argument("--gridProp", help="proportion of random search", default=1, type=float)
+    parser.add_argument("--trait", help="name of trait to be used in the analysis", default=None,required=False)
+    parser.add_argument("--output", help="path to output dir",required=True)
+    Method = parser.add_mutually_exclusive_group()
+    Method.add_argument('--mlp', action='store_true')
+    Method.add_argument('--cnn', action='store_true')
+    Method.add_argument('--lasso', action='store_true')
+    Method.add_argument('--ridge', action='store_true')
+    parser.add_argument('--lr',type=float, nargs='+', help='the learning rate parameter to be used in NN configuration. It can be a list to tune', required=False,default=0.025)
+    parser.add_argument('--dr1',type=float, nargs='+', help='the dropout to be used in  the first layer of  NN configuration. It can be a list to tune', required=False, default=0)
+    parser.add_argument('--dr2',type=float, nargs='+', help='the dropout to be used in  the hidden_layers of the NN configuration. It can be a list to tune', required=False, default=0)
+    parser.add_argument('--reg1',type=float, nargs='+', help='the L2 regularization to be used in  the first layer of  NN configuration. It can be a list to tune', required=False,default=0)
+    parser.add_argument('--nconv',type=int, nargs='+',help='number of convolutions layers to be considered in convolutional operation. It only works if Method is CNN and it can be a list to tune', default=1)
+    parser.add_argument('--act1', action='store',type=str, nargs='+', default=['relu', 'elu', 'linear', 'softmax', 'tanh', 'softplus'],help="Examples: --act1 relu elu, --act1 linear")
+    parser.add_argument('--act2', action='store',type=str, nargs='+', default=['relu', 'elu', 'linear', 'softmax', 'tanh', 'softplus'],help="Examples: --act2 relu elu, --act2 linear")
+    parser.add_argument('--reg2',type=float, nargs='+', help='the L2 regularization to be used in  the hidden_layers of  NN configuration. It can be a list to tune', required=False,default=0)
+    parser.add_argument('--reg3',type=float, nargs='+', help='the L2 regularization to be used in  the output layers of the  NN configuration. It can be a list to tune', required=False,default=0)
+    parser.add_argument('--hn',type=int, nargs='+', help='Number of hidden neurons to be used on the NN configuration. It can be a list to tune', required=False, default=8)
+    parser.add_argument('--ps',type=int, nargs='+', help='Pool size to be used on the CNN configuration. It can be a list to tune', required=False,default=3)
+    parser.add_argument('--hl',type=int, nargs='+', help='Number of hidden layers to be used in the NN configuration. It can be a list to tune', required=False,default=1)
+    parser.add_argument('--ks',type=int, nargs='+', help='kernel_size to be used in CNN configuration. It can be a list to tune', required=False,default=3)
+    parser.add_argument('--ns',type=int, nargs='+', help='stride to be used in CNN configuration. It can be a list to tune', required=False,default=1)
+    parser.add_argument('--epochs',type=int, nargs='+', help='nepochs to be used for training. It can be a list to tune', required=False,default=50)
+    parser.add_argument('--nfilters',type=int, nargs='+', help='number of filter (#convolutions in each convolutional layer). It can be a list to tune', required=False,default=8)
+    parser.add_argument('--optimizer', action='store',type=str, nargs='+', default=['Adam', 'Nadam', 'sgd'],help="Examples: -optimizer Adam")
+    parser.add_argument('--bs',type=int, nargs='+', help='batch size to be used in NN configuration. It can be a list to tune', required=False,default=16)
+    parser.add_argument('--N_neurons_FL',type=int, nargs='+', help='NNeurons to be used at the first layer of MLP configuration. It can be a list to tune', required=False,default=8)
+
+    args = parser.parse_args()
+
+    if not isinstance(args.hn, list):
+	       args.hn = [args.hn]
+    if not isinstance(args.nconv, list):
+	       args.nconv = [args.nconv]
+    if not isinstance(args.ps, list):
+	       args.ps = [args.ps]
+    if not isinstance(args.hl, list):
+	       args.hl = [args.hl]
+    if not isinstance(args.ks, list):
+	       args.ks = [args.ks]
+    if not isinstance(args.ns, list):
+	       args.ns = [args.ns]
+    if not isinstance(args.epochs, list):
+	       args.epochs = [args.epochs]
+    if not isinstance(args.nfilters, list):
+    	   args.nfilters = [args.nfilters]
+    if not isinstance(args.bs, list):
+	       args.bs = [args.bs]
+    if not isinstance(args.N_neurons_FL, list):
+        args.N_neurons_FL = [args.N_neurons_FL]
+    if not isinstance(args.reg1, list):
+	       args.reg1 = [args.reg1]
+    if not isinstance(args.reg2, list):
+	       args.reg2 = [args.reg2]
+    if not isinstance(args.reg3, list):
+	       args.reg3 = [args.reg3]
+    if not isinstance(args.dr1, list):
+	       args.dr1 = [args.dr1]
+    if not isinstance(args.dr2, list):
+	       args.dr2 = [args.dr2]
+    if not isinstance(args.lr, list):
+	       args.lr = [args.lr]
+
+    if args.dummy:
+        if args.Xval is not None:
+            Xtr=pd.read_csv(args.Xtr,sep='\s+')
+            Xval=pd.read_csv(args.Xval,sep='\s+')
+            All_X = pd.concat([Xtr,Xval])
+            All_X = All_X.round(decimals=0)
+            #All_X = All_X.apply(pd.to_numeric)
+            le = OneHotEncoder(sparse=False)
+            label_encoder = LabelEncoder()
+            X_enco = All_X.apply(label_encoder.fit_transform)
+            onehot_encoder = OneHotEncoder(sparse=False)
+            All_X_oh = onehot_encoder.fit_transform(X_enco)
+            X_tr = All_X_oh[0:Xtr.shape[0], :]
+            X_vl = All_X_oh[Xtr.shape[0]:All_X_oh.shape[0], ]
+        else:
+            X_tr = pd.read_csv(args.Xtr,sep='\s+').round(decimals=0)
+            #X_tr = X_tr.apply(pd.to_numeric)
+            le = OneHotEncoder(sparse=False)
+            label_encoder = LabelEncoder()
+            X_enco = X_tr.apply(label_encoder.fit_transform)
+            onehot_encoder = OneHotEncoder(sparse=False)
+            X_tr = onehot_encoder.fit_transform(X_enco)
+            y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
+            X_tr, X_vl, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2)
+    else:
+        if args.Xval is not None:
+            X_tr= pd.read_csv(args.Xtr,sep='\s+').apply(pd.to_numeric)
+            X_vl= pd.read_csv(args.Xval,sep='\s+').apply(pd.to_numeric)
+        else:
+            X_tr=pd.read_csv(args.Xtr,sep='\s+').apply(pd.to_numeric)
+            y_tr=pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
+            X_tr, X_vl, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2)
+    if args.categorical is False:
+        if args.scale is True:
+            if args.cat is not None:
+                if args.yval is not None:
+                    y_tr = pd.read_csv(args.ytr,sep='\s+')
+                    y_val = pd.read_csv(args.yval,sep='\s+').apply(pd.to_numeric)
+                    cat = args.cat
+                    loc = y_tr.columns.get_loc(cat)
+                    Factors=np.unique(y_tr.iloc[:, loc])
+                    scaled=pd.DataFrame([])
+                    for i in Factors:
+                        M = y_tr.iloc[np.where(y_tr.iloc[:, loc] == i)]
+                        M = M.iloc[:, :-1].apply(scale)
+                        scaled=scaled.append(M)
+
+                    y_val = y_val.apply(scale)
+                    scaled.columns=y_tr.columns[:-1]
+                    y_val.columns=y_tr.columns[:-1]
+                    y_tr = scaled
+
+                else:
+                    y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
+                    y_tr = y_tr.apply(scale)
+                    y_val = y_val.apply(scale)
+            else:
+                if args.yval is not None:
+                    y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
+                    y_val = pd.read_csv(args.yval,sep='\s+').apply(pd.to_numeric)
+                    y_val = y_val.apply(scale)
+                    y_tr = y_tr.apply(scale)
+                else:
+                    y_tr = pd.read_csv(args.ytr).apply(pd.to_numeric)
+                    y_tr = y_tr.apply(scale)
+                    y_val = y_val.apply(scale)
+        else:
+            if args.yval is not None:
+                y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
+                y_val = pd.read_csv(args.yval,sep='\s+').apply(pd.to_numeric)
+            else:
+                y_tr = pd.read_csv(args.ytr,sep='\s+').apply(pd.to_numeric)
+
+        if args.trait is not None:
+            trait= args.trait
+            loc=y_tr.columns.get_loc(trait)
+            y_tr=y_tr.iloc[:,loc]
+            y_val=y_val.iloc[:,loc]
+
+        else:
+            trait=None
+        y_tr = np.asarray(y_tr)
+        y_val = np.asarray(y_val)
+    else:
+        if args.yval is not None:
+            y_tr = pd.read_csv(args.ytr,sep='\s+')
+            y_val = pd.read_csv(args.yval,sep='\s+')
+        else:
+            y_tr = pd.read_csv(args.ytr,sep='\s+')
+        if args.trait is not None:
+            trait = args.trait
+            loc = y_tr.columns.get_loc(trait)
+            y_tr = y_tr.iloc[:, loc]
+            y_val = y_val.iloc[:, loc]
+            y_tr = np.asarray(y_tr)
+            y_val = np.asarray(y_val)
+            encoder = LabelEncoder()
+            encoder.fit(y_tr)
+            y_tr = encoder.transform(y_tr)
+            y_tr = to_categorical(y_tr)
+            encoder = LabelEncoder()
+            encoder.fit(y_val)
+            y_val = encoder.transform(y_val)
+            y_val=to_categorical(y_val)
+        else:
+            if len(y_tr.columns) > 1:
+                sys.exit("To categorical Values,Only one trait is allowed ")
+            else:
+                y_tr = np.asarray(y_tr)
+                y_val = np.asarray(y_val)
+                encoder = LabelEncoder()
+                encoder.fit(y_tr)
+                y_tr = encoder.transform(y_tr)
+                y_tr = to_categorical(y_tr)
+                encoder = LabelEncoder()
+                encoder.fit(y_val)
+                y_val = encoder.transform(y_val)
+                y_val = to_categorical(y_val)
+
+    X_vl=np.asarray(X_vl)
+    X_tr=np.asarray(X_tr)
+
+    main=os.getcwd()
+    diro=args.output
+
+###to cat
+
+
+    if args.cnn:
+        if trait is not None:
+            run_cnn_main(X_tr,y_tr,X_vl,y_val,output=diro,main=main,prop=args.gridProp,trait=trait,cat=args.categorical,lr=args.lr,dr_1=args.dr1,dr_2=args.dr2,reg_1=args.reg1,reg_2=args.reg2,reg_3=args.reg3,nconv=args.nconv,act_1=args.act1,act_2=args.act2,hn=args.hn,ps=args.ps,hl=args.hl,ks=args.ks,ns=args.ns,epochs=args.epochs,nf=args.nfilters,op=args.optimizer,bs=args.bs)
+        else:
+            run_cnn_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp,trait=None,cat=args.categorical,lr=args.lr,dr_1=args.dr1,dr_2=args.dr2,reg_1=args.reg1,reg_2=args.reg2,reg_3=args.reg3,nconv=args.nconv,act_1=args.act1,act_2=args.act2,hn=args.hn,ps=args.ps,hl=args.hl,ks=args.ks,ns=args.ns,epochs=args.epochs,nf=args.nfilters,op=args.optimizer,bs=args.bs)
+    if args.mlp:
+        if trait is not None:
+            run_mlp_main(X_tr,y_tr,X_vl,y_val,output=diro,main=main,prop=args.gridProp,trait=trait,cat=args.categorical,lr=args.lr,dr_1=args.dr1,dr_2=args.dr2,reg_1=args.reg1,reg_2=args.reg2,act_1=args.act1,hn=args.hn,hl=args.hl,epochs=args.epochs,op=args.optimizer,bs=args.bs,N_neurons_FL=args.N_neurons_FL)
+        else:
+            run_mlp_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp,trait=None,cat=args.categorical,lr=args.lr,dr_1=args.dr1,dr_2=args.dr2,reg_1=args.reg1,reg_2=args.reg2,act_1=args.act1,hn=args.hn,hl=args.hl,epochs=args.epochs,op=args.optimizer,bs=args.bs,N_neurons_FL=args.N_Neurons_FL)
+    if args.ridge:
+        if args.categorical is True:
+            sys.exit("ridge only allowing to continous outputs")
+        else:
+            run_ridge_main(X_tr,y_tr,X_vl,y_val,output=diro,main=main)
+    if args.lasso:
+
+        if args.categorical is True:
+            sys.exit("ridge only allowing to continous outputs")
+        else:
+            run_lasso_main(X_tr,y_tr,X_vl,y_val,output=diro,main=main)