[977db3]: / GP / Deep_genomic.py

Download this file

277 lines (252 with data), 14.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
"""
Deep_genomic.py allows predicting complex traits by using Deep Learning and Penalized Linear models
Authors: Laura M Zingaretti (m.lau.zingaretti@gmail.com) and Miguel Perez-Enciso (miguel.perez@uab.es)
"""
'''
Copyright (C) 2019 Laura Zingaretti
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
import matplotlib as mpl
mpl.use('Agg')
from CNN.cnn import acc_pearson_r as acc_pearson_r
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
import pandas as pd
import numpy as np
from run_cnn import run_cnn_main
from run_mlp import run_mlp_main
from run_ridge import run_ridge_main
from run_lasso import run_lasso_main
#from run_rnn import run_rnn_main
import seaborn as sns
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical
import os
import sys
import argparse
import logging
if __name__ == '__main__':
    # ------------------------------------------------------------------
    # Command-line interface: data paths, preprocessing switches, and the
    # hyper-parameter grids forwarded to the CNN/MLP/ridge/lasso runners.
    # ------------------------------------------------------------------
    parser = argparse.ArgumentParser()
    parser.add_argument("--Xtr", required=True, help="train Predictor matrix")
    parser.add_argument("--ytr", required=True, help="train response matrix")
    parser.add_argument("--cat", required=False, help="Should be y_train matrix normalized by categories? If yes, category should be provided as cat argument, e.g. --cat T ")
    # if you only want to train the model, you don't need a validation set
    parser.add_argument("--Xval", default=None, required=False, help="validation Predictor")
    parser.add_argument("--yval", default=None, required=False, help="validation response matrix")
    parser.add_argument("--scale", help="Boolean indicating if y should be scaled", action="store_true")
    parser.add_argument("--dummy", help="Convert SNPs into OneHot encoding", action="store_true")
    parser.add_argument("--categorical", help="Are outputs categorical variables?", action="store_true")
    parser.add_argument("--gridProp", help="proportion of random search", default=1, type=float)
    parser.add_argument("--trait", help="name of trait to be used in the analysis", default=None, required=False)
    parser.add_argument("--output", help="path to output dir", required=True)
    # Exactly one fitting method may be selected.
    Method = parser.add_mutually_exclusive_group()
    Method.add_argument('--mlp', action='store_true')
    Method.add_argument('--cnn', action='store_true')
    Method.add_argument('--lasso', action='store_true')
    Method.add_argument('--ridge', action='store_true')
    # Hyper-parameter grids: each may be a single value or a list to tune.
    parser.add_argument('--lr', type=float, nargs='+', help='the learning rate parameter to be used in NN configuration. It can be a list to tune', required=False, default=0.025)
    parser.add_argument('--dr1', type=float, nargs='+', help='the dropout to be used in the first layer of NN configuration. It can be a list to tune', required=False, default=0)
    parser.add_argument('--dr2', type=float, nargs='+', help='the dropout to be used in the hidden_layers of the NN configuration. It can be a list to tune', required=False, default=0)
    parser.add_argument('--reg1', type=float, nargs='+', help='the L2 regularization to be used in the first layer of NN configuration. It can be a list to tune', required=False, default=0)
    parser.add_argument('--nconv', type=int, nargs='+', help='number of convolutions layers to be considered in convolutional operation. It only works if Method is CNN and it can be a list to tune', default=1)
    parser.add_argument('--act1', action='store', type=str, nargs='+', default=['relu', 'elu', 'linear', 'softmax', 'tanh', 'softplus'], help="Examples: --act1 relu elu, --act1 linear")
    parser.add_argument('--act2', action='store', type=str, nargs='+', default=['relu', 'elu', 'linear', 'softmax', 'tanh', 'softplus'], help="Examples: --act2 relu elu, --act2 linear")
    parser.add_argument('--reg2', type=float, nargs='+', help='the L2 regularization to be used in the hidden_layers of NN configuration. It can be a list to tune', required=False, default=0)
    parser.add_argument('--reg3', type=float, nargs='+', help='the L2 regularization to be used in the output layers of the NN configuration. It can be a list to tune', required=False, default=0)
    parser.add_argument('--hn', type=int, nargs='+', help='Number of hidden neurons to be used on the NN configuration. It can be a list to tune', required=False, default=8)
    parser.add_argument('--ps', type=int, nargs='+', help='Pool size to be used on the CNN configuration. It can be a list to tune', required=False, default=3)
    parser.add_argument('--hl', type=int, nargs='+', help='Number of hidden layers to be used in the NN configuration. It can be a list to tune', required=False, default=1)
    parser.add_argument('--ks', type=int, nargs='+', help='kernel_size to be used in CNN configuration. It can be a list to tune', required=False, default=3)
    parser.add_argument('--ns', type=int, nargs='+', help='stride to be used in CNN configuration. It can be a list to tune', required=False, default=1)
    parser.add_argument('--epochs', type=int, nargs='+', help='nepochs to be used for training. It can be a list to tune', required=False, default=50)
    parser.add_argument('--nfilters', type=int, nargs='+', help='number of filter (#convolutions in each convolutional layer). It can be a list to tune', required=False, default=8)
    parser.add_argument('--optimizer', action='store', type=str, nargs='+', default=['Adam', 'Nadam', 'sgd'], help="Examples: -optimizer Adam")
    parser.add_argument('--bs', type=int, nargs='+', help='batch size to be used in NN configuration. It can be a list to tune', required=False, default=16)
    parser.add_argument('--N_neurons_FL', type=int, nargs='+', help='NNeurons to be used at the first layer of MLP configuration. It can be a list to tune', required=False, default=8)
    args = parser.parse_args()

    # argparse with nargs='+' yields a list only when the flag is given on
    # the command line; defaults stay scalar.  The run_* helpers expect
    # every tunable grid as a list, so coerce scalars here (this replaces
    # sixteen copy-pasted isinstance blocks).
    for _grid in ('hn', 'nconv', 'ps', 'hl', 'ks', 'ns', 'epochs', 'nfilters',
                  'bs', 'N_neurons_FL', 'reg1', 'reg2', 'reg3', 'dr1', 'dr2', 'lr'):
        _val = getattr(args, _grid)
        if not isinstance(_val, list):
            setattr(args, _grid, [_val])

    # FIX: make sure `trait` is defined on every path; previously the
    # --categorical path without --trait left it unbound and the method
    # dispatch below raised NameError.
    trait = None

    # ----------------------- predictors (X) ---------------------------
    if args.dummy:
        # One-hot encode the (rounded) SNP matrix.  Train and validation
        # are label/one-hot encoded together so both share the same
        # dummy columns.
        if args.Xval is not None:
            Xtr = pd.read_csv(args.Xtr, sep=r'\s+')
            Xval = pd.read_csv(args.Xval, sep=r'\s+')
            All_X = pd.concat([Xtr, Xval])
            All_X = All_X.round(decimals=0)
            label_encoder = LabelEncoder()
            X_enco = All_X.apply(label_encoder.fit_transform)
            onehot_encoder = OneHotEncoder(sparse=False)
            All_X_oh = onehot_encoder.fit_transform(X_enco)
            # Split back into the original train/validation rows.
            X_tr = All_X_oh[0:Xtr.shape[0], :]
            X_vl = All_X_oh[Xtr.shape[0]:All_X_oh.shape[0], ]
        else:
            X_tr = pd.read_csv(args.Xtr, sep=r'\s+').round(decimals=0)
            label_encoder = LabelEncoder()
            X_enco = X_tr.apply(label_encoder.fit_transform)
            onehot_encoder = OneHotEncoder(sparse=False)
            X_tr = onehot_encoder.fit_transform(X_enco)
            y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
            # No validation files supplied: hold out 20% at random.
            X_tr, X_vl, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2)
    else:
        if args.Xval is not None:
            X_tr = pd.read_csv(args.Xtr, sep=r'\s+').apply(pd.to_numeric)
            X_vl = pd.read_csv(args.Xval, sep=r'\s+').apply(pd.to_numeric)
        else:
            X_tr = pd.read_csv(args.Xtr, sep=r'\s+').apply(pd.to_numeric)
            y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
            X_tr, X_vl, y_tr, y_val = train_test_split(X_tr, y_tr, test_size=0.2)

    # ----------------------- responses (y) ----------------------------
    if args.categorical is False:
        if args.scale is True:
            if args.cat is not None:
                # Standardize each trait within the levels of the grouping
                # column named by --cat (assumed to be the LAST column of
                # y_tr, which is dropped from the scaled output).
                if args.yval is not None:
                    y_tr = pd.read_csv(args.ytr, sep=r'\s+')
                    y_val = pd.read_csv(args.yval, sep=r'\s+').apply(pd.to_numeric)
                    cat = args.cat
                    loc = y_tr.columns.get_loc(cat)
                    Factors = np.unique(y_tr.iloc[:, loc])
                    # FIX: DataFrame.append was deprecated (removed in
                    # pandas 2.0) -- collect groups and concat once.
                    groups = []
                    for i in Factors:
                        M = y_tr.iloc[np.where(y_tr.iloc[:, loc] == i)]
                        groups.append(M.iloc[:, :-1].apply(scale))
                    scaled = pd.concat(groups) if groups else pd.DataFrame([])
                    y_val = y_val.apply(scale)
                    scaled.columns = y_tr.columns[:-1]
                    y_val.columns = y_tr.columns[:-1]
                    y_tr = scaled
                else:
                    # y_val comes from the earlier train_test_split.
                    y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
                    y_tr = y_tr.apply(scale)
                    y_val = y_val.apply(scale)
            else:
                if args.yval is not None:
                    y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
                    y_val = pd.read_csv(args.yval, sep=r'\s+').apply(pd.to_numeric)
                    y_val = y_val.apply(scale)
                    y_tr = y_tr.apply(scale)
                else:
                    # FIX: this read was missing sep='\s+' and parsed the
                    # whitespace-delimited file as comma-separated.
                    y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
                    y_tr = y_tr.apply(scale)
                    y_val = y_val.apply(scale)
        else:
            if args.yval is not None:
                y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
                y_val = pd.read_csv(args.yval, sep=r'\s+').apply(pd.to_numeric)
            else:
                # NOTE(review): this re-read discards the y_tr produced by
                # train_test_split above, so y_tr may not align row-wise
                # with X_tr unless --Xval/--yval are passed together --
                # preserved as-is, verify against intended usage.
                y_tr = pd.read_csv(args.ytr, sep=r'\s+').apply(pd.to_numeric)
        if args.trait is not None:
            # Keep a single named trait column.
            trait = args.trait
            loc = y_tr.columns.get_loc(trait)
            y_tr = y_tr.iloc[:, loc]
            y_val = y_val.iloc[:, loc]
        y_tr = np.asarray(y_tr)
        y_val = np.asarray(y_val)
    else:
        # Categorical outputs: label-encode then one-hot via to_categorical.
        if args.yval is not None:
            y_tr = pd.read_csv(args.ytr, sep=r'\s+')
            y_val = pd.read_csv(args.yval, sep=r'\s+')
        else:
            # NOTE(review): same re-read caveat as above.
            y_tr = pd.read_csv(args.ytr, sep=r'\s+')
        if args.trait is not None:
            trait = args.trait
            loc = y_tr.columns.get_loc(trait)
            y_tr = np.asarray(y_tr.iloc[:, loc])
            y_val = np.asarray(y_val.iloc[:, loc])
            encoder = LabelEncoder()
            encoder.fit(y_tr)
            y_tr = to_categorical(encoder.transform(y_tr))
            encoder = LabelEncoder()
            encoder.fit(y_val)
            y_val = to_categorical(encoder.transform(y_val))
        else:
            if len(y_tr.columns) > 1:
                sys.exit("To categorical Values,Only one trait is allowed ")
            else:
                y_tr = np.asarray(y_tr)
                y_val = np.asarray(y_val)
                encoder = LabelEncoder()
                encoder.fit(y_tr)
                y_tr = to_categorical(encoder.transform(y_tr))
                encoder = LabelEncoder()
                encoder.fit(y_val)
                y_val = to_categorical(encoder.transform(y_val))

    # ----------------------- method dispatch --------------------------
    X_vl = np.asarray(X_vl)
    X_tr = np.asarray(X_tr)
    main = os.getcwd()
    diro = args.output
    if args.cnn:
        # `trait` is either the selected trait name or None, so the two
        # previously duplicated calls collapse into one.
        run_cnn_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp,
                     trait=trait, cat=args.categorical, lr=args.lr, dr_1=args.dr1,
                     dr_2=args.dr2, reg_1=args.reg1, reg_2=args.reg2, reg_3=args.reg3,
                     nconv=args.nconv, act_1=args.act1, act_2=args.act2, hn=args.hn,
                     ps=args.ps, hl=args.hl, ks=args.ks, ns=args.ns, epochs=args.epochs,
                     nf=args.nfilters, op=args.optimizer, bs=args.bs)
    if args.mlp:
        # FIX: the no-trait branch passed args.N_Neurons_FL (wrong case),
        # which raised AttributeError; the attribute is N_neurons_FL.
        run_mlp_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main, prop=args.gridProp,
                     trait=trait, cat=args.categorical, lr=args.lr, dr_1=args.dr1,
                     dr_2=args.dr2, reg_1=args.reg1, reg_2=args.reg2, act_1=args.act1,
                     hn=args.hn, hl=args.hl, epochs=args.epochs, op=args.optimizer,
                     bs=args.bs, N_neurons_FL=args.N_neurons_FL)
    if args.ridge:
        if args.categorical is True:
            sys.exit("ridge only allowing to continous outputs")
        else:
            run_ridge_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main)
    if args.lasso:
        if args.categorical is True:
            # FIX: message previously said "ridge" in the lasso branch.
            sys.exit("lasso only allowing to continous outputs")
        else:
            run_lasso_main(X_tr, y_tr, X_vl, y_val, output=diro, main=main)