[befbfc]: / research_paper_code / notebooks / map_synth.ipynb

Download this file

294 lines (293 with data), 11.2 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "22d22050",
   "metadata": {},
   "source": [
    "# Experiment\n",
    "This notebook generates the experiment results from output using both GEMMA and linear regression in the directory output. The complete list of SNP-s and the p-value computed is generated for each algorithm and a graphical representation of the SNP-s is outputted as PNG.\n",
    "\n",
    "This notebook is similar to map.ipynb but specifically meant to be run on the new synthetic genome/phenome data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4ba8943",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ensure R kernel is installed. For a fresh install, un-comment and run this cell to \n",
    "# install R packages.\n",
    "\n",
    "install.packages(\"qtl\", repos = \"http://cran.us.r-project.org\")\n",
    "install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n",
    "install.packages(\"data.table\", repos = \"http://cran.us.r-project.org\")\n",
    "install.packages(\"stringr\", repos = \"http://cran.us.r-project.org\")\n",
    "install.packages(\"qqman\", repos = \"http://cran.us.r-project.org\")\n",
    "install.packages(\"devtools\", repos = \"http://cran.us.r-project.org\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39ad6796",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Set up working directory structures\n",
    "\n",
    "library(stringr)\n",
    "base_dir        <- str_replace(getwd(), 'research_paper_code/notebooks', '')\n",
    "r_base          <- \"research_paper_code\"\n",
    "experiment_dir  <- \"mice_data_set\" \n",
    "real_gwas_path = paste(base_dir, \"/mice_data_set/out\", sep=\"\")\n",
    "\n",
    "setwd(base_dir)\n",
    "getwd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3bd2a97b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Decide which phenotype you'll be analyzing\n",
    "phenotype_choice = \"abBMD\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dd44e82c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map QTLs for phenotypes measured in CFW outbred mice using the linear\n",
    "# mixed model (LMM) analysis implemented in GEMMA.\n",
    "library(qtl)\n",
    "library(data.table)\n",
    "library(qqman)\n",
    "source(paste(r_base, \"/src/misc.R\", sep=\"\"))\n",
    "source(paste(r_base, \"/src/gemma.R\", sep=\"\"))\n",
    "source(paste(r_base, \"/src/read.data.R\", sep=\"\"))\n",
    "source(paste(r_base, \"/src/data.manip.R\", sep=\"\"))\n",
    "source(paste(r_base, \"/src/qtl.analyses.R\", sep=\"\"))\n",
    "analysis_selection = analyses[\"abBMD\"]\n",
    "for (analysis in analysis_selection) {\n",
    "    print(analysis)\n",
    "}\n",
    "\n",
    "# SCRIPT PARAMETERS\n",
    "# -----------------\n",
    "chromosomes    <- NULL\n",
    "gemmadir       <- paste(experiment_dir, \"/gemma\", sep=\"\")\n",
    "gemma.exe      <- paste(\"./\", \"gemma-0.98.4-linux-static-AMD64\", sep=\"\")\n",
    "geno_txt_base       <- paste(experiment_dir, \"/data/synthetic_genome_data\", sep=\"\")\n",
    "map_txt_base       <- paste(experiment_dir, \"/data/genome_map_data\", sep=\"\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e5cefa75",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read in the synthetic phenotype data\n",
    "pheno_synth_file <- paste(experiment_dir, \"/data/phenome_alldata_synth.csv\", sep=\"\")\n",
    "pheno_all <- read.csv(pheno_synth_file,quote = \"\",header = TRUE,check.names = FALSE,\n",
    "                    stringsAsFactors = FALSE,comment.char = \"#\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "1360e25f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# This is the main gwas function\n",
    "\n",
    "run_gwas <- function(geno_txt, map_txt, pheno_all, analysis, batch) {\n",
    "\n",
    "    # LOAD GENOTYPE DATA\n",
    "    # ------------------\n",
    "    # Load the \"mean genotypes\"; i.e., the the mean alternative allele\n",
    "    # counts.\n",
    "    map     <- read.map(map_txt)\n",
    "    out     <- read.geno.dosage(geno_txt,nrow(map))\n",
    "    discard <- out$discard\n",
    "    X_all   <- out$geno\n",
    "    rm(out)\n",
    "\n",
    "    # Discard genotype samples from mislabeled flowcell samples.\n",
    "    X_all <- X_all[which(discard == \"no\"),]\n",
    "\n",
    "\n",
    "    # Discard SNPs with low \"imputation quality\" assessed by inspecting\n",
    "    # the genotype probabilities. Retain SNPs for which: (1) at least 95%\n",
    "    # of the samples have a maximum probability genotype greater than than\n",
    "    # 0.5; (2) the minor allele frequency is greater than 2%.\n",
    "    f       <- apply(X_all,2,compute.maf)\n",
    "    markers <- which(map$quality > 0.95 & f > 0.02)\n",
    "    map     <- map[markers,]\n",
    "    X_all   <- X_all[,markers]\n",
    "    \n",
    "    # min_var, max_var - which of the columns in genotype data (X_all above) to be considered when \n",
    "    # using linear models (mostly for speed). Please note that gemma analysis will analyze the \n",
    "    # whole chromosome\n",
    "    min_var = 1\n",
    "    max_var = dim(X_all)[2]\n",
    "\n",
    "    analysis_selection <- analyses[phenotype_choice]\n",
    "    chromosomes <- unique(map[min_var:max_var,\"chr\"])\n",
    "    \n",
    "    ##################################\n",
    "    # Cleanup data\n",
    "    phenotype  <- analysis$pheno\n",
    "    covariates <- analysis$cov\n",
    "    outliers   <- analysis$outliers\n",
    " \n",
    "    pheno <- copy(pheno_all)\n",
    "    if (!is.null(outliers))\n",
    "      pheno_all <- remove.outliers(pheno,phenotype,covariates,outliers)\n",
    "\n",
    "    \n",
    "    # Only analyze samples (i.e. rows of the genotype and phenotype\n",
    "    # matrices) for which the phenotype and all the covariates are\n",
    "    # observed.\n",
    "    pheno <- pheno[which(none.missing.row(pheno[c(phenotype,covariates)])),]  \n",
    "     \n",
    "    # Align the phenotypes and genotypes\n",
    "    ids   <- intersect(pheno_all$id,rownames(X_all))\n",
    "    pheno <- pheno_all[match(ids,pheno_all$id),]\n",
    "    X     <- X_all[match(ids,rownames(X_all)),]\n",
    "\n",
    "    ###################################\n",
    "    # Compute using gemma\n",
    "    # MAP QTLs\n",
    "    \n",
    "    \n",
    "    ge_out_dat <- paste(experiment_dir, \"/out_synth/ge_batch\", batch, \"_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".dat\", sep=\"\")\n",
    "    ge_out_csv <- paste(experiment_dir, \"/out_synth/ge_batch\", batch, \"_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\", sep=\"\")\n",
    "    \n",
    "    if (!file.exists(ge_out_csv)) {\n",
    "      # Calculate p-values using GEMMA.\n",
    "        gwscan.gemma <- run.gemma(phenotype,covariates,pheno,X,map,\n",
    "                                  gemmadir,gemma.exe,chromosomes)\n",
    "\n",
    "        # Save results to file.\n",
    "        save(list = c(\"analysis\",\"gwscan.gemma\"),file = ge_out_dat)\n",
    "        \n",
    "        named_gws <- gwscan.gemma\n",
    "        named_gws$snp = rownames(named_gws)\n",
    "        named_gws$p = 10 ^ (-named_gws$log10p)\n",
    "               \n",
    "        write.csv(data.table(named_gws)[order(rank(p)),], ge_out_csv)\n",
    "        \n",
    "    }\n",
    "    \n",
    "    ###################################\n",
    "    # Compute using linear model\n",
    "    lm_out_csv <- paste(experiment_dir, \"/out_synth/lm_batch\", batch, \"_\", analysis$pheno, \"_\", min_var, \"_\", max_var, \".csv\",sep=\"\")\n",
    "    \n",
    "    if(!file.exists(lm_out_csv)) {\n",
    "        print(dim(X)[2])\n",
    "        dt <- data.table(snp=rep(\"\",dim(X)[2]), chr=rep(0,dim(X)[2]), pos=rep(0,dim(X)[2]), p=rep(1,dim(X)[2]))\n",
    "        for (i in min_var:max_var) {\n",
    "            X_variant <- cbind(X[,i], pheno_column=pheno[,analysis$pheno])\n",
    "            colnames(X_variant)[1]<-colnames(X)[i]\n",
    "            f <- paste(\"pheno_column ~ \",colnames(X)[i])\n",
    "            # Add any covariates\n",
    "            for(cov in analysis$cov) {\n",
    "                X_variant <- cbind(X_variant, pheno_column=pheno[,cov])\n",
    "                f <- paste(f,\"+\",cov)\n",
    "            }\n",
    "            res_variant <- lm(pheno_column~., data = data.frame(X_variant))\n",
    "          \n",
    "            dt[i,1] = colnames(X)[i]\n",
    "            dt[i,2] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"chr\"])\n",
    "            dt[i,3] = as.numeric(map[map[\"id\"]==colnames(X)[i],\"pos\"])\n",
    "            dt[i,4] = as.numeric(summary(res_variant)$coefficients[2,4])\n",
    "        }\n",
    "        \n",
    "        # Print to file\n",
    "\n",
    "        write.csv(dt[order(rank(p)),][1:(max_var-min_var)], lm_out_csv)\n",
    "        print(paste(\"Sorted p-values saved in: \", lm_out_csv, sep=\"\"))\n",
    "\n",
    "    }\n",
    "\n",
    "    \n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5e3d381",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Now you can run GWAS for a particular synthetic genome batch\n",
    "# Be sure the contents of out_synth doesn't already have output for this batch or it won't rerun\n",
    "# Restart the notebook for each call to run_gwas\n",
    "\n",
    "batch = 2316\n",
    "geno_txt       <- paste(geno_txt_base, \"/synthetic_genomes_batch\", batch, \".txt\", sep=\"\")\n",
    "map_txt       <- paste(map_txt_base, \"/map_abBMD_batch\", batch, \".txt\", sep=\"\")\n",
    "run_gwas(geno_txt, map_txt, pheno_all, analysis, batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aee40f73",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Alternatively, you can run gwas for a merged larger synthetic genome file\n",
    "# Be sure that any previous files created in out_synth for this pheno aren't still there\n",
    "# or new data will not be created\n",
    "# Restart the notebook for each call to run_gwas\n",
    "# NOTE - YOU MUST USE THE FILENAMES THAT YOU CREATED AT THE END OF THE 04 NOTEBOOK\n",
    "\n",
    "batch = \"all\"\n",
    "geno_txt       <- paste(geno_txt_base, \"/synthetic_genomes_allbatches.txt\", sep=\"\")\n",
    "map_txt       <- paste(map_txt_base, \"/map_abBMD_allbatches.txt\", sep=\"\")\n",
    "run_gwas(geno_txt, map_txt, pheno_all, analysis, batch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "31f34dc0",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "4.1.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}