synthetic-data-genomics / Git / [befbfc] /research_paper_code/src/map_gwas

Models:
MarcoTheBlack/
synthetic-data-genomics
Downloads: 1
[befbfc]: / research_paper_code / src / map_gwas_batch.R
History
Download this file
187 lines (133 with data), 6.0 kB

#!/usr/bin/env Rscript

args = commandArgs(trailingOnly=TRUE)

# Set up working directory structures

base_dir <- ".."
print(base_dir)
r_base          <- "research_paper_code"
experiment_dir  <- "mice_data_set"

setwd(base_dir)


map <- function(){
    
    # Map QTLs for phenotypes measured in CFW outbred mice using the linear
    # mixed model (LMM) analysis implemented in GEMMA.

    # Set up working directory structures

    library(qtl)
    library(data.table)
    library(qqman)
    source(paste(r_base, "/src/misc.R", sep=""))
    source(paste(r_base, "/src/gemma.R", sep=""))
    source(paste(r_base, "/src/read.data.R", sep=""))
    source(paste(r_base, "/src/data.manip.R", sep=""))
    source(paste(r_base, "/src/qtl.analyses.R", sep=""))


    # SCRIPT PARAMETERS
    # -----------------
    chromosomes    <- NULL
    gemmadir       <- paste(experiment_dir, "/gemma", sep="")
    gemma.exe      <- paste("./", "gemma-0.98.4-linux-static-AMD64", sep="")
    geno_txt_base       <- paste(experiment_dir, "/data/synthetic_genomes", sep="")
    map_txt_base       <- paste(experiment_dir, "/data/map_", sep="")
    
    # Read in the synthetic phenotype data
    pheno_synth_file <- paste(experiment_dir, "/data/phenome_alldata_synth.csv", sep="")
    pheno_all <- read.csv(pheno_synth_file,quote = "",header = TRUE,check.names = FALSE,
                    stringsAsFactors = FALSE,comment.char = "#")
    
    # Read in the phenotypes to analyze
    
    filename = paste(experiment_dir, "/data/pheno_analysis.csv", sep="")
    #print(filename)
    pheno_analysis_df <- read.csv(filename,quote = "",header = TRUE,check.names = FALSE,
                    stringsAsFactors = FALSE,comment.char = "#")
    pheno_analysis <- c(pheno_analysis_df$pheno)
    print(pheno_analysis)

    geno_txt       <- paste(geno_txt_base,  ".txt", sep="")
    map_txt       <- paste(map_txt_base, "batch.txt", sep="")
    
    # Compute GWAS for each phenotye
    for(next_pheno in pheno_analysis) {         
        print("Proccessing pheno")
        print(next_pheno)

        if (next_pheno == "testisweight") {
            next_pheno <- "testis"
        }
        if (next_pheno == "D2TOTDIST25") {
            next_pheno <- tolower(next_pheno)
        }
        
        analysis_selection = analyses[next_pheno]

        for (analysis in analysis_selection) {
            print(analysis)
        }

        pheno_all <- read.csv(pheno_synth_file,quote = "",header = TRUE,check.names = FALSE,
                    stringsAsFactors = FALSE,comment.char = "#")
        chromosomes    <- NULL
        run_gwas(geno_txt, map_txt, pheno_all, analysis, gemmadir, gemma.exe)
        
    }
    
}    


run_gwas <- function(geno_txt, map_txt, pheno_all, analysis, gemmadir, gemma.exe) {

    # LOAD GENOTYPE DATA
    # ------------------
    # Load the "mean genotypes"; i.e., the the mean alternative allele
    # counts.

    map     <- read.map(map_txt)

    out     <- read.geno.dosage(geno_txt,nrow(map))
    discard <- out$discard
    X_all   <- out$geno
    rm(out)

    # Discard genotype samples from mislabeled flowcell samples.
    X_all <- X_all[which(discard == "no"),]


    # Discard SNPs with low "imputation quality" assessed by inspecting
    # the genotype probabilities. Retain SNPs for which: (1) at least 95%
    # of the samples have a maximum probability genotype greater than than
    # 0.5; (2) the minor allele frequency is greater than 2%.
    f       <- apply(X_all,2,compute.maf)
    markers <- which(map$quality > 0.95 & f > 0.02)
    map     <- map[markers,]
    X_all   <- X_all[,markers]

    # min_var, max_var - which of the columns in genotype data (X_all above) to be considered when 
    # using linear models (mostly for speed). Please note that gemma analysis will analyze the 
    # whole chromosome
    min_var = 1
    max_var = dim(X_all)[2]

    chromosomes <- unique(map[min_var:max_var,"chr"])
    
    ##################################
    # Cleanup data
    phenotype  <- analysis$pheno
    covariates <- analysis$cov
    outliers   <- analysis$outliers
 
    # Skip removal of outliers for 'soleus' as there's a bug
    pheno <- copy(pheno_all)
    if (!is.null(outliers)) {
        if (analysis$pheno != 'soleus') {
            pheno_all <- remove.outliers(pheno,phenotype,covariates,outliers)
        }
    }
    
    # Only analyze samples (i.e. rows of the genotype and phenotype
    # matrices) for which the phenotype and all the covariates are
    # observed.
    pheno <- pheno[which(none.missing.row(pheno[c(phenotype,covariates)])),] 
     
    # Align the phenotypes and genotypes
    ids   <- intersect(pheno_all$id,rownames(X_all))
    pheno <- pheno_all[match(ids,pheno_all$id),]
    X     <- X_all[match(ids,rownames(X_all)),]

    # Compute linear model

    lm_out_csv <- paste(experiment_dir, "/out_synth_working/lm_", analysis$pheno, ".csv",sep="")

    overwrite = 1
    if (overwrite == 1) {    

        dt <- data.table(snp=rep("",dim(X)[2]), chr=rep(0,dim(X)[2]), pos=rep(0,dim(X)[2]), p=rep(1,dim(X)[2]))

        for (i in min_var:max_var) {
            X_variant <- cbind(X[,i], pheno_column=pheno[,analysis$pheno])
            colnames(X_variant)[1]<-colnames(X)[i]
            f <- paste("pheno_column ~ ",colnames(X)[i])
            # Add any covariates
            for(cov in analysis$cov) {
                X_variant <- cbind(X_variant, pheno_column=pheno[,cov])
                f <- paste(f,"+",cov)
            }

            pheno_column=pheno[,analysis$pheno]
            res_variant <- lm(pheno_column~., data = data.frame(X_variant))
 
            dt[i,1] = colnames(X)[i]

            dt[i,2] = as.numeric(map[map["id"]==colnames(X)[i],"chr"])

            dt[i,3] = as.numeric(map[map["id"]==colnames(X)[i],"pos"])

            dt[i,4] = as.numeric(summary(res_variant)$coefficients[2,4])

        }

        # Print to file
        #write.csv(dt[order(rank(p)),][1:(max_var-min_var)], lm_out_csv)
        write.csv(dt, lm_out_csv)

    }
    
   
    
}

map()